diff --git a/dapo_lora_7b_20251202_002719/checkpoint-192/trainer_state.json b/dapo_lora_7b_20251202_002719/checkpoint-192/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..67737072f97d4cd7de1802b35a96280c775c5059 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-192/trainer_state.json @@ -0,0 +1,5986 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.08831646734130635, + "eval_steps": 500, + "global_step": 192, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16233.0, + "completions/max_terminated_length": 16233.0, + "completions/mean_length": 5701.859375, + "completions/mean_terminated_length": 5701.859375, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "entropy": 0.35103847086429596, + "epoch": 0.00045998160073597056, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0027150087989866734, + "learning_rate": 1e-05, + "loss": 0.0764, + "num_tokens": 372903.0, + "reward": 0.71875, + "reward_std": 0.4581822156906128, + "rewards/accuracy_reward/mean": 0.71875, + "rewards/accuracy_reward/std": 0.4531635046005249, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000617504119873, + "sampling/importance_sampling_ratio/min": 0.2750210464000702, + "sampling/sampling_logp_difference/max": 1.290907621383667, + "sampling/sampling_logp_difference/mean": 0.01358163170516491, + "step": 1 + }, + { + "clip_ratio/high_max": 0.00010992912939400412, + "clip_ratio/high_mean": 2.748228234850103e-05, + "clip_ratio/low_mean": 0.00016060493635450257, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0001880872223409824, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 7385.90625, + "completions/mean_terminated_length": 6455.06884765625, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "entropy": 0.5675897598266602, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0007440462941303849, + "learning_rate": 1e-05, + "loss": -0.0152, + "num_tokens": 856873.0, + "reward": 0.390625, + "reward_std": 0.2198973000049591, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999367594718933, + "sampling/importance_sampling_ratio/min": 0.009396589361131191, + "sampling/sampling_logp_difference/max": 4.667408466339111, + "sampling/sampling_logp_difference/mean": 0.022290317341685295, + "step": 2 + }, + { + "clip_ratio/high_max": 0.00018680206630961038, + "clip_ratio/high_mean": 7.093910403455084e-05, + "clip_ratio/low_mean": 0.0002504906224203296, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00032142972168003325, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15091.0, + "completions/mean_length": 5608.828125, + "completions/mean_terminated_length": 5437.7939453125, + "completions/min_length": 936.0, + "completions/min_terminated_length": 936.0, + "entropy": 0.44635456055402756, + "epoch": 0.0013799448022079118, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002476191846653819, + "learning_rate": 1e-05, + "loss": 0.0755, + "num_tokens": 1225782.0, + "reward": 0.578125, + "reward_std": 0.3776973485946655, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999972581863403, + "sampling/importance_sampling_ratio/min": 0.16118201613426208, + "sampling/sampling_logp_difference/max": 1.825221061706543, + "sampling/sampling_logp_difference/mean": 0.017525848001241684, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0002787337944027968, + "clip_ratio/high_mean": 8.000510115380166e-05, + "clip_ratio/low_mean": 0.00027736531956179533, + "clip_ratio/low_min": 2.338634294574149e-05, + "clip_ratio/region_mean": 0.0003573704316295334, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14515.0, + "completions/max_terminated_length": 14515.0, + "completions/mean_length": 3346.078125, + "completions/mean_terminated_length": 3346.078125, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.545745424926281, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0037713816855102777, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 1453315.0, + "reward": 0.4375, + "reward_std": 0.4413174092769623, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000426769256592, + "sampling/importance_sampling_ratio/min": 0.08748604357242584, + "sampling/sampling_logp_difference/max": 2.4362759590148926, + "sampling/sampling_logp_difference/mean": 0.016878074035048485, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0002736507922236342, + "clip_ratio/high_mean": 0.00012070279444742482, + "clip_ratio/low_mean": 0.00037263989906932693, + "clip_ratio/low_min": 7.880559132900089e-05, + "clip_ratio/region_mean": 0.0004933426898787729, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15992.0, + "completions/mean_length": 7791.578125, + "completions/mean_terminated_length": 5601.35302734375, + "completions/min_length": 788.0, + 
"completions/min_terminated_length": 788.0, + "entropy": 0.4527555741369724, + "epoch": 0.0022999080036798527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0019191562896594405, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 1962144.0, + "reward": 0.484375, + "reward_std": 0.4987064301967621, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000364780426025, + "sampling/importance_sampling_ratio/min": 0.09914527088403702, + "sampling/sampling_logp_difference/max": 2.311169147491455, + "sampling/sampling_logp_difference/mean": 0.019328925758600235, + "step": 5 + }, + { + "clip_ratio/high_max": 0.000247960046181106, + "clip_ratio/high_mean": 6.500758581751143e-05, + "clip_ratio/low_mean": 8.249791471826029e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014750550326425582, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15770.0, + "completions/mean_length": 4726.546875, + "completions/mean_terminated_length": 4350.5, + "completions/min_length": 757.0, + "completions/min_terminated_length": 757.0, + "entropy": 0.5126069597899914, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002579454565420747, + "learning_rate": 1e-05, + "loss": -0.0359, + "num_tokens": 2273043.0, + "reward": 0.484375, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999161958694458, + "sampling/importance_sampling_ratio/min": 0.0002888332528527826, + "sampling/sampling_logp_difference/max": 8.14966106414795, + "sampling/sampling_logp_difference/mean": 0.01803017407655716, + "step": 6 + }, + { + "clip_ratio/high_max": 0.00017989838943321956, + 
"clip_ratio/high_mean": 6.093144725127786e-05, + "clip_ratio/low_mean": 0.00028579145509866066, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003467229043963016, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12743.0, + "completions/mean_length": 7409.0625, + "completions/mean_terminated_length": 6480.62060546875, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "entropy": 0.494194608181715, + "epoch": 0.003219871205151794, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002430765191093087, + "learning_rate": 1e-05, + "loss": 0.0822, + "num_tokens": 2757655.0, + "reward": 0.46875, + "reward_std": 0.40715816617012024, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 0.17787444591522217, + "sampling/sampling_logp_difference/max": 1.726677417755127, + "sampling/sampling_logp_difference/mean": 0.019815418869256973, + "step": 7 + }, + { + "clip_ratio/high_max": 0.00017167176974908216, + "clip_ratio/high_mean": 6.041262804501457e-05, + "clip_ratio/low_mean": 0.0002822945152729517, + "clip_ratio/low_min": 5.028157829656266e-05, + "clip_ratio/region_mean": 0.00034270713513251394, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13078.0, + "completions/mean_length": 4700.203125, + "completions/mean_terminated_length": 4323.30615234375, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "entropy": 0.39490213245153427, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0022012051194906235, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 3072436.0, + "reward": 0.609375, + "reward_std": 0.49446311593055725, + 
"rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998409152030945, + "sampling/importance_sampling_ratio/min": 0.06603337824344635, + "sampling/sampling_logp_difference/max": 2.717594861984253, + "sampling/sampling_logp_difference/mean": 0.016631681472063065, + "step": 8 + }, + { + "clip_ratio/high_max": 0.00013108045459375717, + "clip_ratio/high_mean": 4.318108904044493e-05, + "clip_ratio/low_mean": 0.00023819861780793872, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002813797018461628, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5188.890625, + "completions/mean_terminated_length": 4827.7578125, + "completions/min_length": 790.0, + "completions/min_terminated_length": 790.0, + "entropy": 0.43566014245152473, + "epoch": 0.004139834406623735, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0016241734847426414, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 3414085.0, + "reward": 0.59375, + "reward_std": 0.39820659160614014, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 1.9456381797790527, + "sampling/importance_sampling_ratio/mean": 1.0000399351119995, + "sampling/importance_sampling_ratio/min": 0.10360148549079895, + "sampling/sampling_logp_difference/max": 2.2672035694122314, + "sampling/sampling_logp_difference/mean": 0.01550372689962387, + "step": 9 + }, + { + "clip_ratio/high_max": 0.00010115922304976266, + "clip_ratio/high_mean": 2.5289805762440665e-05, + "clip_ratio/low_mean": 0.00034295484147151, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003682446440507192, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15217.0, + "completions/mean_length": 5832.875, + "completions/mean_terminated_length": 5492.51611328125, + "completions/min_length": 717.0, + "completions/min_terminated_length": 717.0, + "entropy": 0.600818321108818, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0010776554699987173, + "learning_rate": 1e-05, + "loss": -0.0314, + "num_tokens": 3798397.0, + "reward": 0.328125, + "reward_std": 0.37298911809921265, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999531507492065, + "sampling/importance_sampling_ratio/min": 0.0953303873538971, + "sampling/sampling_logp_difference/max": 2.3504066467285156, + "sampling/sampling_logp_difference/mean": 0.020683372393250465, + "step": 10 + }, + { + "clip_ratio/high_max": 0.00030824893383396557, + "clip_ratio/high_mean": 0.00011632417340479151, + "clip_ratio/low_mean": 0.0002341717704439361, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003504959422571119, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15746.0, + "completions/max_terminated_length": 15746.0, + "completions/mean_length": 4986.171875, + "completions/mean_terminated_length": 4986.171875, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "entropy": 0.40387310832738876, + "epoch": 0.005059797608095676, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003584277583286166, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 4127424.0, + "reward": 0.671875, + "reward_std": 0.4434390664100647, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998393654823303, + "sampling/importance_sampling_ratio/min": 0.02629905194044113, + "sampling/sampling_logp_difference/max": 
3.6382224559783936, + "sampling/sampling_logp_difference/mean": 0.01555373053997755, + "step": 11 + }, + { + "clip_ratio/high_max": 0.00013135069002601085, + "clip_ratio/high_mean": 4.189404148746689e-05, + "clip_ratio/low_mean": 0.00014246321052269195, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018435725178278517, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10079.0, + "completions/max_terminated_length": 10079.0, + "completions/mean_length": 3880.515625, + "completions/mean_terminated_length": 3880.515625, + "completions/min_length": 674.0, + "completions/min_terminated_length": 674.0, + "entropy": 0.4064784087240696, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017852422315627337, + "learning_rate": 1e-05, + "loss": 0.0198, + "num_tokens": 4384473.0, + "reward": 0.671875, + "reward_std": 0.2867126166820526, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999723434448242, + "sampling/importance_sampling_ratio/min": 0.37220701575279236, + "sampling/sampling_logp_difference/max": 0.9883050918579102, + "sampling/sampling_logp_difference/mean": 0.013887828215956688, + "step": 12 + }, + { + "clip_ratio/high_max": 0.00014981444019213086, + "clip_ratio/high_mean": 4.5794572770319064e-05, + "clip_ratio/low_mean": 0.00040218312869910733, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00044797768418902706, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16341.0, + "completions/mean_length": 8894.578125, + "completions/mean_terminated_length": 7669.0361328125, + "completions/min_length": 1085.0, + "completions/min_terminated_length": 1085.0, + "entropy": 0.5499315299093723, + "epoch": 0.005979760809567618, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004000168293714523, + 
"learning_rate": 1e-05, + "loss": 0.0373, + "num_tokens": 4963350.0, + "reward": 0.390625, + "reward_std": 0.2824692726135254, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999991774559021, + "sampling/importance_sampling_ratio/min": 0.047493718564510345, + "sampling/sampling_logp_difference/max": 3.0471577644348145, + "sampling/sampling_logp_difference/mean": 0.02204228937625885, + "step": 13 + }, + { + "clip_ratio/high_max": 0.00018746273144643055, + "clip_ratio/high_mean": 5.583179722634668e-05, + "clip_ratio/low_mean": 0.0001284618601857801, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001842936590037425, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12058.0, + "completions/max_terminated_length": 12058.0, + "completions/mean_length": 4584.0625, + "completions/mean_terminated_length": 4584.0625, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.4566480815410614, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003257408272475004, + "learning_rate": 1e-05, + "loss": -0.0342, + "num_tokens": 5266274.0, + "reward": 0.671875, + "reward_std": 0.3751009702682495, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750256538391, + "sampling/importance_sampling_ratio/min": 0.39602163434028625, + "sampling/sampling_logp_difference/max": 0.9262864589691162, + "sampling/sampling_logp_difference/mean": 0.01598881185054779, + "step": 14 + }, + { + "clip_ratio/high_max": 0.00015991039845175692, + "clip_ratio/high_mean": 5.3697508178629505e-05, + "clip_ratio/low_mean": 0.0003120610426776693, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00036575855119735934, + 
"completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15009.0, + "completions/mean_length": 5134.671875, + "completions/mean_terminated_length": 4581.42578125, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "entropy": 0.41497115045785904, + "epoch": 0.0068997240110395585, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004677772056311369, + "learning_rate": 1e-05, + "loss": 0.05, + "num_tokens": 5603925.0, + "reward": 0.640625, + "reward_std": 0.3913571238517761, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001789331436157, + "sampling/importance_sampling_ratio/min": 0.07364130765199661, + "sampling/sampling_logp_difference/max": 2.608549118041992, + "sampling/sampling_logp_difference/mean": 0.016165096312761307, + "step": 15 + }, + { + "clip_ratio/high_max": 0.00025949142946046777, + "clip_ratio/high_mean": 9.68364292930346e-05, + "clip_ratio/low_mean": 0.000282365266684792, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000379201697796816, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15173.0, + "completions/max_terminated_length": 15173.0, + "completions/mean_length": 4904.96875, + "completions/mean_terminated_length": 4904.96875, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.4841916747391224, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.002402309561148286, + "learning_rate": 1e-05, + "loss": 0.0633, + "num_tokens": 5928091.0, + "reward": 0.484375, + "reward_std": 0.41246524453163147, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999504685401917, + 
"sampling/importance_sampling_ratio/min": 0.0037722671404480934, + "sampling/sampling_logp_difference/max": 5.580079078674316, + "sampling/sampling_logp_difference/mean": 0.018390391021966934, + "step": 16 + }, + { + "clip_ratio/high_max": 6.219606439117342e-05, + "clip_ratio/high_mean": 1.5549016097793356e-05, + "clip_ratio/low_mean": 0.00019023374534299364, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002057827605312923, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13837.0, + "completions/mean_length": 5209.84375, + "completions/mean_terminated_length": 3837.578857421875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.3513585068285465, + "epoch": 0.0078196872125115, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019373978720977902, + "learning_rate": 1e-05, + "loss": 0.0016, + "num_tokens": 6271057.0, + "reward": 0.453125, + "reward_std": 0.3403330445289612, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999862015247345, + "sampling/importance_sampling_ratio/min": 0.1450539529323578, + "sampling/sampling_logp_difference/max": 1.9306495189666748, + "sampling/sampling_logp_difference/mean": 0.013681268319487572, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0001431612308806507, + "clip_ratio/high_mean": 4.711323526862543e-05, + "clip_ratio/low_mean": 9.270217788071022e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001398154154230724, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9328.0, + "completions/max_terminated_length": 9328.0, + "completions/mean_length": 2520.640625, + "completions/mean_terminated_length": 2520.640625, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 0.36302734911441803, + 
"epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027223003562539816, + "learning_rate": 1e-05, + "loss": -0.0416, + "num_tokens": 6441562.0, + "reward": 0.65625, + "reward_std": 0.33090677857398987, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000017762184143, + "sampling/importance_sampling_ratio/min": 0.3734391927719116, + "sampling/sampling_logp_difference/max": 0.9850001335144043, + "sampling/sampling_logp_difference/mean": 0.011676793918013573, + "step": 18 + }, + { + "clip_ratio/high_max": 0.00017718410344969016, + "clip_ratio/high_mean": 5.833459545101505e-05, + "clip_ratio/low_mean": 0.0002528423356125131, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00031117693106352817, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15108.0, + "completions/mean_length": 4240.96875, + "completions/mean_terminated_length": 4048.222412109375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.3896213509142399, + "epoch": 0.008739650413983441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.002503112656995654, + "learning_rate": 1e-05, + "loss": 0.0739, + "num_tokens": 6721568.0, + "reward": 0.59375, + "reward_std": 0.4991811513900757, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999947547912598, + "sampling/importance_sampling_ratio/min": 0.10363919287919998, + "sampling/sampling_logp_difference/max": 2.2668397426605225, + "sampling/sampling_logp_difference/mean": 0.014314994215965271, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0002049997847279883, + "clip_ratio/high_mean": 6.95637043008901e-05, + "clip_ratio/low_mean": 
0.00011690972041833447, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018647342039912473, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15322.0, + "completions/mean_length": 3738.484375, + "completions/mean_terminated_length": 3116.573486328125, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "entropy": 0.29045598581433296, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002947593806311488, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 6969399.0, + "reward": 0.8125, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.39339789748191833, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998925924301147, + "sampling/importance_sampling_ratio/min": 0.11472277343273163, + "sampling/sampling_logp_difference/max": 2.165236711502075, + "sampling/sampling_logp_difference/mean": 0.011310569941997528, + "step": 20 + }, + { + "clip_ratio/high_max": 0.00010545731220190646, + "clip_ratio/high_mean": 3.014280719071394e-05, + "clip_ratio/low_mean": 0.00011199774735359824, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014214055443062534, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15392.0, + "completions/mean_length": 6065.90625, + "completions/mean_terminated_length": 5191.49169921875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.44125597178936005, + "epoch": 0.009659613615455382, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0011246009962633252, + "learning_rate": 1e-05, + "loss": 0.0021, + "num_tokens": 7365937.0, + "reward": 0.421875, + "reward_std": 0.23144522309303284, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000147819519043, + "sampling/importance_sampling_ratio/min": 0.25809481739997864, + "sampling/sampling_logp_difference/max": 1.3544282913208008, + "sampling/sampling_logp_difference/mean": 0.017348822206258774, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0003601935495680664, + "clip_ratio/high_mean": 9.941099415300414e-05, + "clip_ratio/low_mean": 0.00034870224044425413, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004481132409637212, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10951.0, + "completions/mean_length": 3722.015625, + "completions/mean_terminated_length": 3521.031982421875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.4340820461511612, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.001601650146767497, + "learning_rate": 1e-05, + "loss": 0.0015, + "num_tokens": 7615658.0, + "reward": 0.5, + "reward_std": 0.3913668990135193, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998630285263062, + "sampling/importance_sampling_ratio/min": 1.3064802715234691e-06, + "sampling/sampling_logp_difference/max": 13.548173904418945, + "sampling/sampling_logp_difference/mean": 0.016604293137788773, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0002349931419303175, + "clip_ratio/high_mean": 6.471897268056637e-05, + "clip_ratio/low_mean": 0.00014105365880823229, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020577262966980925, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15141.0, + "completions/max_terminated_length": 15141.0, + "completions/mean_length": 3747.484375, + "completions/mean_terminated_length": 3747.484375, + 
"completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.43806017562747, + "epoch": 0.010579576816927323, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017510901670902967, + "learning_rate": 1e-05, + "loss": -0.0391, + "num_tokens": 7867545.0, + "reward": 0.5625, + "reward_std": 0.22461533546447754, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000494718551636, + "sampling/importance_sampling_ratio/min": 0.1432838886976242, + "sampling/sampling_logp_difference/max": 1.942927360534668, + "sampling/sampling_logp_difference/mean": 0.015971330925822258, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0002638470396050252, + "clip_ratio/high_mean": 8.973176045401487e-05, + "clip_ratio/low_mean": 0.0001654990855968208, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002552308424128569, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15089.0, + "completions/mean_length": 4868.609375, + "completions/mean_terminated_length": 4685.82568359375, + "completions/min_length": 1304.0, + "completions/min_terminated_length": 1304.0, + "entropy": 0.3689058944582939, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025512739084661007, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 8187720.0, + "reward": 0.625, + "reward_std": 0.35824596881866455, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999942779541016, + "sampling/importance_sampling_ratio/min": 0.21243424713611603, + "sampling/sampling_logp_difference/max": 1.5491228103637695, + "sampling/sampling_logp_difference/mean": 0.01530374214053154, + "step": 24 + }, + { + "clip_ratio/high_max": 
0.00016221465284615988, + "clip_ratio/high_mean": 5.93273357480939e-05, + "clip_ratio/low_mean": 0.0003561860394256655, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00041551337380951736, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16105.0, + "completions/mean_length": 7169.59375, + "completions/mean_terminated_length": 7023.33349609375, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "entropy": 0.5559867396950722, + "epoch": 0.011499540018399264, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0009040784207172692, + "learning_rate": 1e-05, + "loss": 0.0516, + "num_tokens": 8657286.0, + "reward": 0.328125, + "reward_std": 0.2414703518152237, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000077247619629, + "sampling/importance_sampling_ratio/min": 0.244469553232193, + "sampling/sampling_logp_difference/max": 1.4086644649505615, + "sampling/sampling_logp_difference/mean": 0.021266434341669083, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0001577084094606107, + "clip_ratio/high_mean": 4.298096519050887e-05, + "clip_ratio/low_mean": 0.00013108373877912527, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001740647035148868, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6734.921875, + "completions/mean_terminated_length": 6091.650390625, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "entropy": 0.44154683500528336, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002558791544288397, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 9099577.0, + "reward": 0.515625, + "reward_std": 0.2777610719203949, + 
"rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999955296516418, + "sampling/importance_sampling_ratio/min": 0.077813521027565, + "sampling/sampling_logp_difference/max": 2.5534400939941406, + "sampling/sampling_logp_difference/mean": 0.0186590775847435, + "step": 26 + }, + { + "clip_ratio/high_max": 0.00014542990538757294, + "clip_ratio/high_mean": 3.6357476346893236e-05, + "clip_ratio/low_mean": 0.00021458245646499563, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00025093993099289946, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15505.0, + "completions/mean_length": 4848.078125, + "completions/mean_terminated_length": 4475.95166015625, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.4912428632378578, + "epoch": 0.012419503219871205, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017661909805610776, + "learning_rate": 1e-05, + "loss": 0.0957, + "num_tokens": 9420006.0, + "reward": 0.515625, + "reward_std": 0.3403330445289612, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000015139579773, + "sampling/importance_sampling_ratio/min": 0.14381231367588043, + "sampling/sampling_logp_difference/max": 1.9392461776733398, + "sampling/sampling_logp_difference/mean": 0.017206422984600067, + "step": 27 + }, + { + "clip_ratio/high_max": 0.00031798147210793104, + "clip_ratio/high_mean": 0.00010812525488290703, + "clip_ratio/low_mean": 0.00021282920124576776, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00032095445021695923, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
15477.0, + "completions/mean_length": 5689.8125, + "completions/mean_terminated_length": 5163.86865234375, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.4508574977517128, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0030540244188159704, + "learning_rate": 1e-05, + "loss": 0.0809, + "num_tokens": 9793746.0, + "reward": 0.53125, + "reward_std": 0.42552614212036133, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999897480010986, + "sampling/importance_sampling_ratio/min": 8.414050967076037e-07, + "sampling/sampling_logp_difference/max": 13.988192558288574, + "sampling/sampling_logp_difference/mean": 0.016547517850995064, + "step": 28 + }, + { + "clip_ratio/high_max": 0.00019940425045206212, + "clip_ratio/high_mean": 5.6281104662048165e-05, + "clip_ratio/low_mean": 0.00010776506042020628, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016404616417275975, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14758.0, + "completions/max_terminated_length": 14758.0, + "completions/mean_length": 3069.78125, + "completions/mean_terminated_length": 3069.78125, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.39274851977825165, + "epoch": 0.013339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034625211264938116, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 10000348.0, + "reward": 0.546875, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000138282775879, + "sampling/importance_sampling_ratio/min": 0.32597410678863525, + "sampling/sampling_logp_difference/max": 
1.1209373474121094, + "sampling/sampling_logp_difference/mean": 0.014218954369425774, + "step": 29 + }, + { + "clip_ratio/high_max": 0.00012761429206875619, + "clip_ratio/high_mean": 4.307139124648529e-05, + "clip_ratio/low_mean": 0.00010018590637628222, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014325729807751486, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16360.0, + "completions/mean_length": 5308.3125, + "completions/mean_terminated_length": 4763.6064453125, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "entropy": 0.50441013276577, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.00156789505854249, + "learning_rate": 1e-05, + "loss": 0.0046, + "num_tokens": 10350440.0, + "reward": 0.515625, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000964403152466, + "sampling/importance_sampling_ratio/min": 0.04705130681395531, + "sampling/sampling_logp_difference/max": 3.056516647338867, + "sampling/sampling_logp_difference/mean": 0.019430290907621384, + "step": 30 + }, + { + "clip_ratio/high_max": 0.00016632911138003692, + "clip_ratio/high_mean": 5.557040094572585e-05, + "clip_ratio/low_mean": 0.0002778837697405834, + "clip_ratio/low_min": 1.6620682799839415e-05, + "clip_ratio/region_mean": 0.00033345417978125624, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15490.0, + "completions/mean_length": 6388.265625, + "completions/mean_terminated_length": 5354.22412109375, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.5342313349246979, + "epoch": 0.014259429622815088, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.0026365246158093214, + "learning_rate": 1e-05, + "loss": 0.0118, + "num_tokens": 10768153.0, + "reward": 0.359375, + "reward_std": 0.31983357667922974, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998627305030823, + "sampling/importance_sampling_ratio/min": 0.26772308349609375, + "sampling/sampling_logp_difference/max": 1.31780207157135, + "sampling/sampling_logp_difference/mean": 0.017920637503266335, + "step": 31 + }, + { + "clip_ratio/high_max": 0.00017989536627283087, + "clip_ratio/high_mean": 5.500852148543345e-05, + "clip_ratio/low_mean": 0.00012964008692506468, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018464860841049813, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14732.0, + "completions/mean_length": 5229.078125, + "completions/mean_terminated_length": 4869.24169921875, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.38906631618738174, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022169759031385183, + "learning_rate": 1e-05, + "loss": 0.0213, + "num_tokens": 11111918.0, + "reward": 0.765625, + "reward_std": 0.3629639744758606, + "rewards/accuracy_reward/mean": 0.765625, + "rewards/accuracy_reward/std": 0.42695629596710205, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999115467071533, + "sampling/importance_sampling_ratio/min": 0.08817384392023087, + "sampling/sampling_logp_difference/max": 2.4284448623657227, + "sampling/sampling_logp_difference/mean": 0.015222044661641121, + "step": 32 + }, + { + "clip_ratio/high_max": 0.00014480652316706255, + "clip_ratio/high_mean": 4.443957550392952e-05, + "clip_ratio/low_mean": 0.00012809812687919475, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.00017253770374736632, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 5148.453125, + "completions/mean_terminated_length": 4786.01611328125, + "completions/min_length": 815.0, + "completions/min_terminated_length": 815.0, + "entropy": 0.5083456933498383, + "epoch": 0.01517939282428703, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003128955839201808, + "learning_rate": 1e-05, + "loss": -0.0622, + "num_tokens": 11451323.0, + "reward": 0.53125, + "reward_std": 0.34034284949302673, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000025987625122, + "sampling/importance_sampling_ratio/min": 0.10359863191843033, + "sampling/sampling_logp_difference/max": 2.2672312259674072, + "sampling/sampling_logp_difference/mean": 0.017722681164741516, + "step": 33 + }, + { + "clip_ratio/high_max": 5.51352559341467e-05, + "clip_ratio/high_mean": 1.3783813983536675e-05, + "clip_ratio/low_mean": 7.914142133813584e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.292523554904619e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13048.0, + "completions/mean_length": 4609.5, + "completions/mean_terminated_length": 3824.533447265625, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "entropy": 0.49830054119229317, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0007577431970275939, + "learning_rate": 1e-05, + "loss": 0.0132, + "num_tokens": 11758275.0, + "reward": 0.375, + "reward_std": 0.2041158676147461, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998879432678223, + "sampling/importance_sampling_ratio/min": 0.05370701104402542, + "sampling/sampling_logp_difference/max": 2.9242117404937744, + "sampling/sampling_logp_difference/mean": 0.01685405895113945, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0001986039806070039, + "clip_ratio/high_mean": 6.727558275088086e-05, + "clip_ratio/low_mean": 0.0003367365798112587, + "clip_ratio/low_min": 6.28791003691731e-05, + "clip_ratio/region_mean": 0.000404012165745371, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14497.0, + "completions/mean_length": 4593.015625, + "completions/mean_terminated_length": 4013.130859375, + "completions/min_length": 1094.0, + "completions/min_terminated_length": 1094.0, + "entropy": 0.3128826189786196, + "epoch": 0.01609935602575897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0026802816428244114, + "learning_rate": 1e-05, + "loss": 0.1212, + "num_tokens": 12063516.0, + "reward": 0.625, + "reward_std": 0.49234145879745483, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999848008155823, + "sampling/importance_sampling_ratio/min": 0.0008915311773307621, + "sampling/sampling_logp_difference/max": 7.0225701332092285, + "sampling/sampling_logp_difference/mean": 0.01317686028778553, + "step": 35 + }, + { + "clip_ratio/high_max": 7.243978234328097e-05, + "clip_ratio/high_mean": 1.8109945585820242e-05, + "clip_ratio/low_mean": 9.390242212248268e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011201236907254497, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 5015.171875, + "completions/mean_terminated_length": 4456.048828125, + "completions/min_length": 666.0, 
+ "completions/min_terminated_length": 666.0, + "entropy": 0.37973257526755333, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002345556626096368, + "learning_rate": 1e-05, + "loss": -0.0941, + "num_tokens": 12393103.0, + "reward": 0.640625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000877380371094, + "sampling/importance_sampling_ratio/min": 0.1842055469751358, + "sampling/sampling_logp_difference/max": 1.6917030811309814, + "sampling/sampling_logp_difference/mean": 0.0145792867988348, + "step": 36 + }, + { + "clip_ratio/high_max": 0.00014789494525757618, + "clip_ratio/high_mean": 4.601037198881386e-05, + "clip_ratio/low_mean": 0.0003090670288656838, + "clip_ratio/low_min": 1.8808304957929067e-05, + "clip_ratio/region_mean": 0.00035507740903995, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15632.0, + "completions/mean_length": 5598.484375, + "completions/mean_terminated_length": 5068.048828125, + "completions/min_length": 1283.0, + "completions/min_terminated_length": 1283.0, + "entropy": 0.35928424820303917, + "epoch": 0.01701931922723091, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0015618539182469249, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 12761230.0, + "reward": 0.546875, + "reward_std": 0.4240131676197052, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999818801879883, + "sampling/importance_sampling_ratio/min": 0.2261282205581665, + "sampling/sampling_logp_difference/max": 2.6031017303466797, + "sampling/sampling_logp_difference/mean": 0.01447785273194313, + "step": 37 + }, + { + 
"clip_ratio/high_max": 4.21205932070734e-05, + "clip_ratio/high_mean": 1.053014830176835e-05, + "clip_ratio/low_mean": 4.961071590514621e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.014086420691456e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16037.0, + "completions/mean_length": 5366.125, + "completions/mean_terminated_length": 4824.26220703125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 0.41980869323015213, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0011855819029733539, + "learning_rate": 1e-05, + "loss": 0.0588, + "num_tokens": 13115038.0, + "reward": 0.5, + "reward_std": 0.17570312321186066, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999375343322754, + "sampling/importance_sampling_ratio/min": 0.15887950360774994, + "sampling/sampling_logp_difference/max": 1.839609146118164, + "sampling/sampling_logp_difference/mean": 0.015550841577351093, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0003506070097500924, + "clip_ratio/high_mean": 0.00010976320845657028, + "clip_ratio/low_mean": 0.0001256909990843269, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023545420481241308, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15317.0, + "completions/max_terminated_length": 15317.0, + "completions/mean_length": 3308.296875, + "completions/mean_terminated_length": 3308.296875, + "completions/min_length": 786.0, + "completions/min_terminated_length": 786.0, + "entropy": 0.38983067497611046, + "epoch": 0.017939282428702852, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0023375866003334522, + "learning_rate": 1e-05, + "loss": 0.0624, + "num_tokens": 13335329.0, + "reward": 0.59375, + "reward_std": 
0.3913668990135193, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99998939037323, + "sampling/importance_sampling_ratio/min": 0.0030945157632231712, + "sampling/sampling_logp_difference/max": 5.77812385559082, + "sampling/sampling_logp_difference/mean": 0.013900299556553364, + "step": 39 + }, + { + "clip_ratio/high_max": 0.000169710889167618, + "clip_ratio/high_mean": 5.673388113791589e-05, + "clip_ratio/low_mean": 0.00029868036835978273, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000355414251316688, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15761.0, + "completions/mean_length": 5426.078125, + "completions/mean_terminated_length": 4497.44091796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.43789565935730934, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0025193989276885986, + "learning_rate": 1e-05, + "loss": 0.0349, + "num_tokens": 13691110.0, + "reward": 0.5, + "reward_std": 0.45134252309799194, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 0.14047929644584656, + "sampling/sampling_logp_difference/max": 1.9626951217651367, + "sampling/sampling_logp_difference/mean": 0.015961986035108566, + "step": 40 + }, + { + "clip_ratio/high_max": 8.76178437465569e-05, + "clip_ratio/high_mean": 2.3123878236219753e-05, + "clip_ratio/low_mean": 0.00019285815869807266, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002159820378437871, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16374.0, + "completions/mean_length": 4766.140625, + "completions/mean_terminated_length": 4194.77001953125, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "entropy": 0.47973789647221565, + "epoch": 0.018859245630174794, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0005962434806860983, + "learning_rate": 1e-05, + "loss": 0.0018, + "num_tokens": 14006911.0, + "reward": 0.484375, + "reward_std": 0.2382849156856537, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000395774841309, + "sampling/importance_sampling_ratio/min": 0.12198832631111145, + "sampling/sampling_logp_difference/max": 2.103829860687256, + "sampling/sampling_logp_difference/mean": 0.016915298998355865, + "step": 41 + }, + { + "clip_ratio/high_max": 6.694088551739696e-05, + "clip_ratio/high_mean": 2.3428712665918283e-05, + "clip_ratio/low_mean": 0.0002706102432057378, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002940389586001402, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14691.0, + "completions/mean_length": 5922.421875, + "completions/mean_terminated_length": 4637.66650390625, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.42647283896803856, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001872243476100266, + "learning_rate": 1e-05, + "loss": 0.0244, + "num_tokens": 14394946.0, + "reward": 0.4375, + "reward_std": 0.36295416951179504, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999639987945557, + "sampling/importance_sampling_ratio/min": 0.293357253074646, + 
"sampling/sampling_logp_difference/max": 2.1049091815948486, + "sampling/sampling_logp_difference/mean": 0.01656758040189743, + "step": 42 + }, + { + "clip_ratio/high_max": 0.00015323197931138566, + "clip_ratio/high_mean": 4.9833591447168146e-05, + "clip_ratio/low_mean": 0.00034982425768248504, + "clip_ratio/low_min": 1.088660519599216e-05, + "clip_ratio/region_mean": 0.0003996578489022795, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 6493.1875, + "completions/mean_terminated_length": 6006.75390625, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "entropy": 0.4782983772456646, + "epoch": 0.019779208831646734, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.00166318379342556, + "learning_rate": 1e-05, + "loss": 0.0511, + "num_tokens": 14821182.0, + "reward": 0.46875, + "reward_std": 0.4092700183391571, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998992085456848, + "sampling/importance_sampling_ratio/min": 1.7716387219479657e-06, + "sampling/sampling_logp_difference/max": 13.243605613708496, + "sampling/sampling_logp_difference/mean": 0.018610000610351562, + "step": 43 + }, + { + "clip_ratio/high_max": 6.034070747773512e-05, + "clip_ratio/high_mean": 1.6863068026395922e-05, + "clip_ratio/low_mean": 9.460987712373026e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011147294480906567, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16269.0, + "completions/mean_length": 4648.546875, + "completions/mean_terminated_length": 4269.98388671875, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.4597437307238579, + "epoch": 0.020239190432382703, + 
"frac_reward_zero_std": 0.5, + "grad_norm": 0.0008557081455364823, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 15128561.0, + "reward": 0.328125, + "reward_std": 0.23144522309303284, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 0.2670474946498871, + "sampling/sampling_logp_difference/max": 1.320328712463379, + "sampling/sampling_logp_difference/mean": 0.016183078289031982, + "step": 44 + }, + { + "clip_ratio/high_max": 0.00016895902081159875, + "clip_ratio/high_mean": 6.0399999711080454e-05, + "clip_ratio/low_mean": 0.0002296717866556719, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00029007178636675235, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 6930.234375, + "completions/mean_terminated_length": 6129.06787109375, + "completions/min_length": 682.0, + "completions/min_terminated_length": 682.0, + "entropy": 0.5115556567907333, + "epoch": 0.020699172033118676, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016648141900077462, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 15582168.0, + "reward": 0.5625, + "reward_std": 0.3424547016620636, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000483989715576, + "sampling/importance_sampling_ratio/min": 0.187262162566185, + "sampling/sampling_logp_difference/max": 1.937586784362793, + "sampling/sampling_logp_difference/mean": 0.019788919016718864, + "step": 45 + }, + { + "clip_ratio/high_max": 9.100124134420184e-05, + "clip_ratio/high_mean": 3.351398640916159e-05, + "clip_ratio/low_mean": 0.000253890422754921, + "clip_ratio/low_min": 
0.0, + "clip_ratio/region_mean": 0.0002874044093914563, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16321.0, + "completions/mean_length": 6264.671875, + "completions/mean_terminated_length": 5938.24169921875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.43167873099446297, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0013617790536955, + "learning_rate": 1e-05, + "loss": 0.0032, + "num_tokens": 15994715.0, + "reward": 0.640625, + "reward_std": 0.3766237497329712, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999179840087891, + "sampling/importance_sampling_ratio/min": 0.1620832085609436, + "sampling/sampling_logp_difference/max": 1.8196454048156738, + "sampling/sampling_logp_difference/mean": 0.017889156937599182, + "step": 46 + }, + { + "clip_ratio/high_max": 6.15748222116963e-05, + "clip_ratio/high_mean": 1.870576988949324e-05, + "clip_ratio/low_mean": 0.0003191337254975224, + "clip_ratio/low_min": 4.877414176007733e-05, + "clip_ratio/region_mean": 0.0003378394994797418, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12838.0, + "completions/mean_length": 4168.140625, + "completions/mean_terminated_length": 3974.23828125, + "completions/min_length": 705.0, + "completions/min_terminated_length": 705.0, + "entropy": 0.433504331856966, + "epoch": 0.021619135234590615, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003133355872705579, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 16272044.0, + "reward": 0.34375, + "reward_std": 0.3377465009689331, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998913407325745, + "sampling/importance_sampling_ratio/min": 0.38697248697280884, + "sampling/sampling_logp_difference/max": 1.4266910552978516, + "sampling/sampling_logp_difference/mean": 0.014272443950176239, + "step": 47 + }, + { + "clip_ratio/high_max": 5.0198698772874195e-05, + "clip_ratio/high_mean": 1.2549674693218549e-05, + "clip_ratio/low_mean": 0.00024944932374637574, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002619989991217153, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5228.15625, + "completions/mean_terminated_length": 4868.2900390625, + "completions/min_length": 1099.0, + "completions/min_terminated_length": 1099.0, + "entropy": 0.6134471148252487, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002945883432403207, + "learning_rate": 1e-05, + "loss": 0.0237, + "num_tokens": 16616510.0, + "reward": 0.453125, + "reward_std": 0.39560043811798096, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000022053718567, + "sampling/importance_sampling_ratio/min": 0.23671367764472961, + "sampling/sampling_logp_difference/max": 1.4409040212631226, + "sampling/sampling_logp_difference/mean": 0.01892893575131893, + "step": 48 + }, + { + "clip_ratio/high_max": 0.00010992094757966697, + "clip_ratio/high_mean": 3.773104890569812e-05, + "clip_ratio/low_mean": 0.0002085948569856555, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002463259042997379, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15580.0, + "completions/max_terminated_length": 15580.0, + "completions/mean_length": 4286.90625, + "completions/mean_terminated_length": 4286.90625, + 
"completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "entropy": 0.3194341119378805, + "epoch": 0.022539098436062558, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0033129912335425615, + "learning_rate": 1e-05, + "loss": -0.0135, + "num_tokens": 16903128.0, + "reward": 0.578125, + "reward_std": 0.4113916754722595, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000847578048706, + "sampling/importance_sampling_ratio/min": 0.14042755961418152, + "sampling/sampling_logp_difference/max": 1.9630634784698486, + "sampling/sampling_logp_difference/mean": 0.0129241943359375, + "step": 49 + }, + { + "clip_ratio/high_max": 0.00010812897107825847, + "clip_ratio/high_mean": 3.162783127663715e-05, + "clip_ratio/low_mean": 0.0001828691292757867, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021449696214403957, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15964.0, + "completions/mean_length": 5032.125, + "completions/mean_terminated_length": 4070.101806640625, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "entropy": 0.4777919165790081, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021068111527711153, + "learning_rate": 1e-05, + "loss": -0.0866, + "num_tokens": 17236504.0, + "reward": 0.515625, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000327825546265, + "sampling/importance_sampling_ratio/min": 0.2832590341567993, + "sampling/sampling_logp_difference/max": 1.8220746517181396, + "sampling/sampling_logp_difference/mean": 0.01738543063402176, + "step": 50 + }, + { 
+ "clip_ratio/high_max": 0.00012820017036574427, + "clip_ratio/high_mean": 3.647331323008984e-05, + "clip_ratio/low_mean": 0.00025561100665072445, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002920843198808143, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13201.0, + "completions/mean_length": 4803.203125, + "completions/mean_terminated_length": 4619.38134765625, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "entropy": 0.4494751952588558, + "epoch": 0.023459061637534497, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028032760601490736, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 17553269.0, + "reward": 0.609375, + "reward_std": 0.3403330445289612, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999998152256012, + "sampling/importance_sampling_ratio/min": 0.21100811660289764, + "sampling/sampling_logp_difference/max": 1.5558586120605469, + "sampling/sampling_logp_difference/mean": 0.01737060397863388, + "step": 51 + }, + { + "clip_ratio/high_max": 0.00010267168681821204, + "clip_ratio/high_mean": 3.3487939049337e-05, + "clip_ratio/low_mean": 0.00015384274320240365, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018733068225174065, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 7100.3125, + "completions/mean_terminated_length": 6643.7373046875, + "completions/min_length": 1183.0, + "completions/min_terminated_length": 1183.0, + "entropy": 0.5009776279330254, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001591994776390493, + "learning_rate": 1e-05, + "loss": -0.0421, + "num_tokens": 18016729.0, + "reward": 0.453125, + 
"reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000343322753906, + "sampling/importance_sampling_ratio/min": 0.09941783547401428, + "sampling/sampling_logp_difference/max": 2.3084237575531006, + "sampling/sampling_logp_difference/mean": 0.01882891170680523, + "step": 52 + }, + { + "clip_ratio/high_max": 0.00016665930297676823, + "clip_ratio/high_mean": 5.2525359819810546e-05, + "clip_ratio/low_mean": 0.0004211304803902749, + "clip_ratio/low_min": 9.529018279863521e-05, + "clip_ratio/region_mean": 0.0004736558298645832, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14227.0, + "completions/mean_length": 6233.796875, + "completions/mean_terminated_length": 5557.1171875, + "completions/min_length": 1338.0, + "completions/min_terminated_length": 1338.0, + "entropy": 0.48881014063954353, + "epoch": 0.02437902483900644, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003694011364132166, + "learning_rate": 1e-05, + "loss": 0.1627, + "num_tokens": 18426140.0, + "reward": 0.625, + "reward_std": 0.3977220952510834, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 0.20072485506534576, + "sampling/sampling_logp_difference/max": 1.6058201789855957, + "sampling/sampling_logp_difference/mean": 0.01879170536994934, + "step": 53 + }, + { + "clip_ratio/high_max": 0.00012100895446565119, + "clip_ratio/high_mean": 4.9377299660591234e-05, + "clip_ratio/low_mean": 0.00019421957949816715, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00024359687631658744, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 
16384.0, + "completions/max_terminated_length": 15854.0, + "completions/mean_length": 5629.03125, + "completions/mean_terminated_length": 5282.0966796875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.3631018362939358, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.001484633656218648, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 18794958.0, + "reward": 0.609375, + "reward_std": 0.4050365090370178, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000042200088501, + "sampling/importance_sampling_ratio/min": 0.002677773591130972, + "sampling/sampling_logp_difference/max": 5.922769546508789, + "sampling/sampling_logp_difference/mean": 0.013976464979350567, + "step": 54 + }, + { + "clip_ratio/high_max": 0.00021361040307965595, + "clip_ratio/high_mean": 8.756921079111635e-05, + "clip_ratio/low_mean": 0.0002042179089585261, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00029178711429267423, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 5366.453125, + "completions/mean_terminated_length": 5191.57177734375, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 0.34573371335864067, + "epoch": 0.025298988040478382, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0018017840338870883, + "learning_rate": 1e-05, + "loss": -0.0307, + "num_tokens": 19148275.0, + "reward": 0.734375, + "reward_std": 0.4050365090370178, + "rewards/accuracy_reward/mean": 0.734375, + "rewards/accuracy_reward/std": 0.44515693187713623, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999160766601562, + "sampling/importance_sampling_ratio/min": 
0.22769968211650848, + "sampling/sampling_logp_difference/max": 1.4797277450561523, + "sampling/sampling_logp_difference/mean": 0.014456957578659058, + "step": 55 + }, + { + "clip_ratio/high_max": 0.00020042336745973444, + "clip_ratio/high_mean": 5.850923639627581e-05, + "clip_ratio/low_mean": 0.00019344742031535134, + "clip_ratio/low_min": 1.594387686054688e-05, + "clip_ratio/region_mean": 0.0002519566587579902, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15942.0, + "completions/mean_length": 5801.921875, + "completions/mean_terminated_length": 5460.564453125, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "entropy": 0.4420101195573807, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0007390208193100989, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 19530718.0, + "reward": 0.421875, + "reward_std": 0.2993341088294983, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999010562896729, + "sampling/importance_sampling_ratio/min": 0.04691341519355774, + "sampling/sampling_logp_difference/max": 3.0594515800476074, + "sampling/sampling_logp_difference/mean": 0.016371876001358032, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0001929260479300865, + "clip_ratio/high_mean": 7.267188334481034e-05, + "clip_ratio/low_mean": 0.00013643273086927366, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020910461648782075, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15886.0, + "completions/max_terminated_length": 15886.0, + "completions/mean_length": 3581.09375, + "completions/mean_terminated_length": 3581.09375, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.36750902235507965, + "epoch": 
0.02621895124195032, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020201546140015125, + "learning_rate": 1e-05, + "loss": 0.1245, + "num_tokens": 19771076.0, + "reward": 0.578125, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000082015991211, + "sampling/importance_sampling_ratio/min": 0.21508392691612244, + "sampling/sampling_logp_difference/max": 2.204270362854004, + "sampling/sampling_logp_difference/mean": 0.013558689504861832, + "step": 57 + }, + { + "clip_ratio/high_max": 0.00019395453546167118, + "clip_ratio/high_mean": 6.426821187233145e-05, + "clip_ratio/low_mean": 0.00017469121939939214, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023895943377283402, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14150.0, + "completions/max_terminated_length": 14150.0, + "completions/mean_length": 4180.46875, + "completions/mean_terminated_length": 4180.46875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.4649594761431217, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0028552189469337463, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 20048138.0, + "reward": 0.53125, + "reward_std": 0.4276576042175293, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000083446502686, + "sampling/importance_sampling_ratio/min": 0.2393883913755417, + "sampling/sampling_logp_difference/max": 1.4296680688858032, + "sampling/sampling_logp_difference/mean": 0.017490293830633163, + "step": 58 + }, + { + "clip_ratio/high_max": 0.00014915554584149504, + "clip_ratio/high_mean": 3.9898490058476455e-05, + "clip_ratio/low_mean": 
5.383538700698409e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.373387524647114e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15323.0, + "completions/max_terminated_length": 15323.0, + "completions/mean_length": 4642.15625, + "completions/mean_terminated_length": 4642.15625, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "entropy": 0.41386983543634415, + "epoch": 0.027138914443422264, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014837872004136443, + "learning_rate": 1e-05, + "loss": -0.0232, + "num_tokens": 20355020.0, + "reward": 0.65625, + "reward_std": 0.3198433816432953, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001411437988281, + "sampling/importance_sampling_ratio/min": 0.022514859214425087, + "sampling/sampling_logp_difference/max": 3.7935798168182373, + "sampling/sampling_logp_difference/mean": 0.015344480983912945, + "step": 59 + }, + { + "clip_ratio/high_max": 7.379077214864083e-05, + "clip_ratio/high_mean": 2.223373576271115e-05, + "clip_ratio/low_mean": 0.00013174474815969006, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001539784839224012, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15170.0, + "completions/max_terminated_length": 15170.0, + "completions/mean_length": 3369.015625, + "completions/mean_terminated_length": 3369.015625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.46293293312191963, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023857210762798786, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 20579309.0, + "reward": 0.40625, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49501484632492065, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999541640281677, + "sampling/importance_sampling_ratio/min": 0.00012647465337067842, + "sampling/sampling_logp_difference/max": 8.975468635559082, + "sampling/sampling_logp_difference/mean": 0.016323832795023918, + "step": 60 + }, + { + "clip_ratio/high_max": 0.00010131701310456265, + "clip_ratio/high_mean": 3.068578371312469e-05, + "clip_ratio/low_mean": 0.00017564234258315992, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002063281253867899, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15267.0, + "completions/mean_length": 4186.265625, + "completions/mean_terminated_length": 3992.651123046875, + "completions/min_length": 636.0, + "completions/min_terminated_length": 636.0, + "entropy": 0.4424850195646286, + "epoch": 0.028058877644894203, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.001888959901407361, + "learning_rate": 1e-05, + "loss": -0.0867, + "num_tokens": 20858230.0, + "reward": 0.5, + "reward_std": 0.43401283025741577, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001115798950195, + "sampling/importance_sampling_ratio/min": 0.21523967385292053, + "sampling/sampling_logp_difference/max": 1.5360031127929688, + "sampling/sampling_logp_difference/mean": 0.015638090670108795, + "step": 61 + }, + { + "clip_ratio/high_max": 0.00018883940902014729, + "clip_ratio/high_mean": 6.83412895341462e-05, + "clip_ratio/low_mean": 0.00029582804199890234, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003641693292593118, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15784.0, + "completions/mean_length": 8232.328125, + "completions/mean_terminated_length": 
7231.24560546875, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "entropy": 0.4720785431563854, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0010464832885190845, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 21394763.0, + "reward": 0.421875, + "reward_std": 0.30617380142211914, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999436736106873, + "sampling/importance_sampling_ratio/min": 0.05187493562698364, + "sampling/sampling_logp_difference/max": 2.9589195251464844, + "sampling/sampling_logp_difference/mean": 0.019340507686138153, + "step": 62 + }, + { + "clip_ratio/high_max": 7.807558085914934e-05, + "clip_ratio/high_mean": 2.2267657527663687e-05, + "clip_ratio/low_mean": 0.0001811299157452595, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020339757793408353, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15783.0, + "completions/mean_length": 6065.875, + "completions/mean_terminated_length": 5558.42578125, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "entropy": 0.5249982811510563, + "epoch": 0.028978840846366146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016154105542227626, + "learning_rate": 1e-05, + "loss": 0.1536, + "num_tokens": 21793091.0, + "reward": 0.40625, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998780488967896, + "sampling/importance_sampling_ratio/min": 0.05374135076999664, + "sampling/sampling_logp_difference/max": 2.923572540283203, + "sampling/sampling_logp_difference/mean": 0.017961012199521065, + 
"step": 63 + }, + { + "clip_ratio/high_max": 3.358934282005066e-05, + "clip_ratio/high_mean": 8.397335705012665e-06, + "clip_ratio/low_mean": 3.994480266555911e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.834213746107707e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5830.015625, + "completions/mean_terminated_length": 5489.564453125, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.49247242510318756, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0013925280654802918, + "learning_rate": 1e-05, + "loss": 0.0145, + "num_tokens": 22176908.0, + "reward": 0.375, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000190734863281, + "sampling/importance_sampling_ratio/min": 0.00015296634228434414, + "sampling/sampling_logp_difference/max": 8.785292625427246, + "sampling/sampling_logp_difference/mean": 0.016575772315263748, + "step": 64 + }, + { + "clip_ratio/high_max": 0.00016343776496796636, + "clip_ratio/high_mean": 4.387032890917908e-05, + "clip_ratio/low_mean": 0.00010361431054661807, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014748463922842348, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11830.0, + "completions/max_terminated_length": 11830.0, + "completions/mean_length": 3988.203125, + "completions/mean_terminated_length": 3988.203125, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 0.5831322409212589, + "epoch": 0.029898804047838085, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023537185043096542, + "learning_rate": 1e-05, + "loss": 0.0137, + "num_tokens": 22443753.0, + "reward": 0.5625, + 
"reward_std": 0.33090677857398987, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999172687530518, + "sampling/importance_sampling_ratio/min": 0.017818376421928406, + "sampling/sampling_logp_difference/max": 4.027524948120117, + "sampling/sampling_logp_difference/mean": 0.01679972931742668, + "step": 65 + }, + { + "clip_ratio/high_max": 7.745890934529598e-05, + "clip_ratio/high_mean": 2.375019573719328e-05, + "clip_ratio/low_mean": 0.0002563541038398398, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002801043035560724, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15926.0, + "completions/mean_length": 6300.890625, + "completions/mean_terminated_length": 5804.99951171875, + "completions/min_length": 1197.0, + "completions/min_terminated_length": 1197.0, + "entropy": 0.45622409880161285, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014021627139300108, + "learning_rate": 1e-05, + "loss": 0.147, + "num_tokens": 22857290.0, + "reward": 0.5, + "reward_std": 0.378745436668396, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030517578125, + "sampling/importance_sampling_ratio/min": 0.30470171570777893, + "sampling/sampling_logp_difference/max": 1.5125246047973633, + "sampling/sampling_logp_difference/mean": 0.017332255840301514, + "step": 66 + }, + { + "clip_ratio/high_max": 9.111399231187534e-05, + "clip_ratio/high_mean": 2.768481340353901e-05, + "clip_ratio/low_mean": 0.00022677685137750814, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00025446166773690493, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 12245.0, + "completions/mean_length": 5160.984375, + "completions/mean_terminated_length": 4609.03271484375, + "completions/min_length": 757.0, + "completions/min_terminated_length": 757.0, + "entropy": 0.41627733781933784, + "epoch": 0.030818767249310028, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0016577172791585326, + "learning_rate": 1e-05, + "loss": 0.0101, + "num_tokens": 23198369.0, + "reward": 0.6875, + "reward_std": 0.3729792833328247, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.467176616191864, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000255107879639, + "sampling/importance_sampling_ratio/min": 0.3549080193042755, + "sampling/sampling_logp_difference/max": 1.094315528869629, + "sampling/sampling_logp_difference/mean": 0.016087274998426437, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0001335845809080638, + "clip_ratio/high_mean": 4.6601401209045434e-05, + "clip_ratio/low_mean": 0.00029043503491266165, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003370364320289809, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14731.0, + "completions/mean_length": 4509.109375, + "completions/mean_terminated_length": 3717.4501953125, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.42583196237683296, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0033681185450404882, + "learning_rate": 1e-05, + "loss": -0.0017, + "num_tokens": 23495008.0, + "reward": 0.609375, + "reward_std": 0.49446311593055725, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998528361320496, + "sampling/importance_sampling_ratio/min": 0.00014199235010892153, + 
"sampling/sampling_logp_difference/max": 8.859737396240234, + "sampling/sampling_logp_difference/mean": 0.016656186431646347, + "step": 68 + }, + { + "clip_ratio/high_max": 0.00010585653399175499, + "clip_ratio/high_mean": 3.166284977851319e-05, + "clip_ratio/low_mean": 0.00032884415986700333, + "clip_ratio/low_min": 3.282563193351962e-05, + "clip_ratio/region_mean": 0.00036050701601197943, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14715.0, + "completions/mean_length": 6722.671875, + "completions/mean_terminated_length": 6411.01611328125, + "completions/min_length": 860.0, + "completions/min_terminated_length": 860.0, + "entropy": 0.5157046765089035, + "epoch": 0.03173873045078197, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.00212118704803288, + "learning_rate": 1e-05, + "loss": -0.0105, + "num_tokens": 23934899.0, + "reward": 0.328125, + "reward_std": 0.2414703369140625, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000383853912354, + "sampling/importance_sampling_ratio/min": 0.15753091871738434, + "sampling/sampling_logp_difference/max": 1.8481335639953613, + "sampling/sampling_logp_difference/mean": 0.019955601543188095, + "step": 69 + }, + { + "clip_ratio/high_max": 0.00011434509951868677, + "clip_ratio/high_mean": 3.723538395661308e-05, + "clip_ratio/low_mean": 0.00033702207656460814, + "clip_ratio/low_min": 1.756851634127088e-05, + "clip_ratio/region_mean": 0.00037425746631924994, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15667.0, + "completions/mean_length": 8077.90625, + "completions/mean_terminated_length": 6539.74072265625, + "completions/min_length": 825.0, + "completions/min_terminated_length": 825.0, + "entropy": 0.45871395990252495, + "epoch": 
0.03219871205151794, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.001251625595614314, + "learning_rate": 1e-05, + "loss": 0.0493, + "num_tokens": 24461341.0, + "reward": 0.453125, + "reward_std": 0.38664889335632324, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999666213989258, + "sampling/importance_sampling_ratio/min": 0.11547781527042389, + "sampling/sampling_logp_difference/max": 2.158676862716675, + "sampling/sampling_logp_difference/mean": 0.019339658319950104, + "step": 70 + }, + { + "clip_ratio/high_max": 0.00010712722541939002, + "clip_ratio/high_mean": 3.323841019664542e-05, + "clip_ratio/low_mean": 0.0001494285193075484, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018266692586621502, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16074.0, + "completions/mean_length": 5004.703125, + "completions/mean_terminated_length": 4445.0654296875, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 0.5422494150698185, + "epoch": 0.03265869365225391, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019212040351703763, + "learning_rate": 1e-05, + "loss": 0.0703, + "num_tokens": 24791282.0, + "reward": 0.578125, + "reward_std": 0.31983357667922974, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000091791152954, + "sampling/importance_sampling_ratio/min": 0.2572252154350281, + "sampling/sampling_logp_difference/max": 1.4805357456207275, + "sampling/sampling_logp_difference/mean": 0.016796359792351723, + "step": 71 + }, + { + "clip_ratio/high_max": 0.00014950770128052682, + "clip_ratio/high_mean": 4.267084386810893e-05, + "clip_ratio/low_mean": 
7.438720058416948e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011705804536177311, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16099.0, + "completions/mean_length": 5883.796875, + "completions/mean_terminated_length": 3939.31494140625, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.35789375379681587, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0005889610038138926, + "learning_rate": 1e-05, + "loss": -0.0215, + "num_tokens": 25176237.0, + "reward": 0.46875, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999967813491821, + "sampling/importance_sampling_ratio/min": 0.08377405256032944, + "sampling/sampling_logp_difference/max": 2.4796319007873535, + "sampling/sampling_logp_difference/mean": 0.014260279014706612, + "step": 72 + }, + { + "clip_ratio/high_max": 0.00012918999300381984, + "clip_ratio/high_mean": 5.179685820166924e-05, + "clip_ratio/low_mean": 0.00011270135428276262, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016449821669084486, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12097.0, + "completions/mean_length": 5086.890625, + "completions/mean_terminated_length": 4722.4677734375, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "entropy": 0.38051650673151016, + "epoch": 0.03357865685372585, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003188627539202571, + "learning_rate": 1e-05, + "loss": 0.0989, + "num_tokens": 25517502.0, + "reward": 0.828125, + "reward_std": 0.3571978807449341, + "rewards/accuracy_reward/mean": 0.828125, + "rewards/accuracy_reward/std": 0.38025420904159546, 
+ "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.19293053448200226, + "sampling/sampling_logp_difference/max": 1.6454250812530518, + "sampling/sampling_logp_difference/mean": 0.013862463645637035, + "step": 73 + }, + { + "clip_ratio/high_max": 4.800585429620696e-05, + "clip_ratio/high_mean": 1.9420242779233376e-05, + "clip_ratio/low_mean": 6.698135666738381e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.640159967399086e-05, + "completions/clipped_ratio": 0.125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16214.0, + "completions/mean_length": 7180.609375, + "completions/mean_terminated_length": 5865.83935546875, + "completions/min_length": 1171.0, + "completions/min_terminated_length": 1171.0, + "entropy": 0.47618816792964935, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0008011905592866242, + "learning_rate": 1e-05, + "loss": -0.0153, + "num_tokens": 25986021.0, + "reward": 0.546875, + "reward_std": 0.2519446909427643, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999934732913971, + "sampling/importance_sampling_ratio/min": 0.17329953610897064, + "sampling/sampling_logp_difference/max": 1.7527337074279785, + "sampling/sampling_logp_difference/mean": 0.017364704981446266, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0001653397707741533, + "clip_ratio/high_mean": 4.513738815603574e-05, + "clip_ratio/low_mean": 0.0003383910643606214, + "clip_ratio/low_min": 2.9063008696539328e-05, + "clip_ratio/region_mean": 0.0003835284496744862, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 6114.203125, + 
"completions/mean_terminated_length": 5429.55029296875, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 0.4914289750158787, + "epoch": 0.03449862005519779, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0013586069690063596, + "learning_rate": 1e-05, + "loss": -0.0356, + "num_tokens": 26387378.0, + "reward": 0.453125, + "reward_std": 0.4050365090370178, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000791549682617, + "sampling/importance_sampling_ratio/min": 0.07695373892784119, + "sampling/sampling_logp_difference/max": 2.5645508766174316, + "sampling/sampling_logp_difference/mean": 0.019334372133016586, + "step": 75 + }, + { + "clip_ratio/high_max": 3.184090246577398e-05, + "clip_ratio/high_mean": 1.024358095946809e-05, + "clip_ratio/low_mean": 0.00011936229930142872, + "clip_ratio/low_min": 5.828592748002848e-06, + "clip_ratio/region_mean": 0.0001296058802608968, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14485.0, + "completions/mean_length": 7263.515625, + "completions/mean_terminated_length": 6655.48388671875, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.4553263336420059, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0006440540309995413, + "learning_rate": 1e-05, + "loss": 0.0767, + "num_tokens": 26861243.0, + "reward": 0.546875, + "reward_std": 0.2382849156856537, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980092048645, + "sampling/importance_sampling_ratio/min": 0.15337347984313965, + "sampling/sampling_logp_difference/max": 1.8748793601989746, + 
"sampling/sampling_logp_difference/mean": 0.016627371311187744, + "step": 76 + }, + { + "clip_ratio/high_max": 5.934812543273438e-05, + "clip_ratio/high_mean": 1.4837031358183594e-05, + "clip_ratio/low_mean": 0.00015511889660047018, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016995592795865377, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13194.0, + "completions/max_terminated_length": 13194.0, + "completions/mean_length": 3835.0625, + "completions/mean_terminated_length": 3835.0625, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "entropy": 0.5679256543517113, + "epoch": 0.03541858325666973, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015393231296911836, + "learning_rate": 1e-05, + "loss": 0.0614, + "num_tokens": 27117047.0, + "reward": 0.515625, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999722242355347, + "sampling/importance_sampling_ratio/min": 0.1471611112356186, + "sampling/sampling_logp_difference/max": 1.9162273406982422, + "sampling/sampling_logp_difference/mean": 0.017565816640853882, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0001033300140989013, + "clip_ratio/high_mean": 3.157118726448971e-05, + "clip_ratio/low_mean": 0.00023221444325827179, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00026378563097750884, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14929.0, + "completions/max_terminated_length": 14929.0, + "completions/mean_length": 5001.0625, + "completions/mean_terminated_length": 5001.0625, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "entropy": 0.4684673063457012, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0018944795010611415, + "learning_rate": 1e-05, + "loss": 
0.1526, + "num_tokens": 27445811.0, + "reward": 0.5, + "reward_std": 0.42081791162490845, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.051326826214790344, + "sampling/sampling_logp_difference/max": 2.9695417881011963, + "sampling/sampling_logp_difference/mean": 0.017393115907907486, + "step": 78 + }, + { + "clip_ratio/high_max": 0.00025518189613649156, + "clip_ratio/high_mean": 7.311715717150946e-05, + "clip_ratio/low_mean": 0.0003523219229464303, + "clip_ratio/low_min": 4.194631037535146e-05, + "clip_ratio/region_mean": 0.0004254390933056129, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11377.0, + "completions/max_terminated_length": 11377.0, + "completions/mean_length": 4863.6875, + "completions/mean_terminated_length": 4863.6875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.48279719054698944, + "epoch": 0.03633854645814168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0036398270167410374, + "learning_rate": 1e-05, + "loss": 0.0757, + "num_tokens": 27768311.0, + "reward": 0.5, + "reward_std": 0.41034358739852905, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999462962150574, + "sampling/importance_sampling_ratio/min": 0.1956295520067215, + "sampling/sampling_logp_difference/max": 1.6315324306488037, + "sampling/sampling_logp_difference/mean": 0.017851797863841057, + "step": 79 + }, + { + "clip_ratio/high_max": 8.58052426337963e-05, + "clip_ratio/high_mean": 2.783080799417803e-05, + "clip_ratio/low_mean": 0.00012623786369658774, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015406867260026047, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 13248.0, + "completions/mean_length": 4896.609375, + "completions/mean_terminated_length": 4526.04833984375, + "completions/min_length": 719.0, + "completions/min_terminated_length": 719.0, + "entropy": 0.4410700872540474, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013588728616014123, + "learning_rate": 1e-05, + "loss": 0.0289, + "num_tokens": 28090318.0, + "reward": 0.5625, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000934600830078, + "sampling/importance_sampling_ratio/min": 0.2560647428035736, + "sampling/sampling_logp_difference/max": 1.3827834129333496, + "sampling/sampling_logp_difference/mean": 0.01850186660885811, + "step": 80 + }, + { + "clip_ratio/high_max": 0.00012984564500584383, + "clip_ratio/high_mean": 4.1093299500971625e-05, + "clip_ratio/low_mean": 0.00019706484090420417, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023815813938199426, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16324.0, + "completions/mean_length": 5411.6875, + "completions/mean_terminated_length": 5237.52392578125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.435161255300045, + "epoch": 0.037258509659613616, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002362700179219246, + "learning_rate": 1e-05, + "loss": -0.0162, + "num_tokens": 28446506.0, + "reward": 0.609375, + "reward_std": 0.41185659170150757, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999968409538269, + "sampling/importance_sampling_ratio/min": 
0.11768288910388947, + "sampling/sampling_logp_difference/max": 2.1397616863250732, + "sampling/sampling_logp_difference/mean": 0.016388364136219025, + "step": 81 + }, + { + "clip_ratio/high_max": 0.00018365592040936463, + "clip_ratio/high_mean": 5.955360620646388e-05, + "clip_ratio/low_mean": 0.00016669651313350187, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00022625011933996575, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14620.0, + "completions/mean_length": 5077.125, + "completions/mean_terminated_length": 4521.048828125, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 0.390783354640007, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004699930548667908, + "learning_rate": 1e-05, + "loss": 0.1469, + "num_tokens": 28780506.0, + "reward": 0.703125, + "reward_std": 0.26621314883232117, + "rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.4604927599430084, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000169277191162, + "sampling/importance_sampling_ratio/min": 0.260940283536911, + "sampling/sampling_logp_difference/max": 1.8336589336395264, + "sampling/sampling_logp_difference/mean": 0.014649204909801483, + "step": 82 + }, + { + "clip_ratio/high_max": 0.00018347952845942928, + "clip_ratio/high_mean": 6.701854022139742e-05, + "clip_ratio/low_mean": 0.0003997059175162576, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00046672445023432374, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13433.0, + "completions/mean_length": 3479.25, + "completions/mean_terminated_length": 3274.412841796875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.46421004086732864, + "epoch": 0.038178472861085555, + 
"frac_reward_zero_std": 0.125, + "grad_norm": 0.004662287421524525, + "learning_rate": 1e-05, + "loss": -0.0959, + "num_tokens": 29014730.0, + "reward": 0.375, + "reward_std": 0.41610968112945557, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000164270401001, + "sampling/importance_sampling_ratio/min": 0.20871604979038239, + "sampling/sampling_logp_difference/max": 1.5667805671691895, + "sampling/sampling_logp_difference/mean": 0.018132932484149933, + "step": 83 + }, + { + "clip_ratio/high_max": 0.00015986033577064518, + "clip_ratio/high_mean": 5.5160472129500704e-05, + "clip_ratio/low_mean": 0.00017546498065712512, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023062545551510993, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10876.0, + "completions/max_terminated_length": 10876.0, + "completions/mean_length": 5015.3125, + "completions/mean_terminated_length": 5015.3125, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "entropy": 0.4448152147233486, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00349896471016109, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 29347366.0, + "reward": 0.59375, + "reward_std": 0.34929439425468445, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999954104423523, + "sampling/importance_sampling_ratio/min": 0.32092222571372986, + "sampling/sampling_logp_difference/max": 1.1365565061569214, + "sampling/sampling_logp_difference/mean": 0.017620427533984184, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0001525146399217192, + "clip_ratio/high_mean": 4.684553550760029e-05, + "clip_ratio/low_mean": 0.0003866927354465588, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004335382654971909, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15504.0, + "completions/max_terminated_length": 15504.0, + "completions/mean_length": 4333.453125, + "completions/mean_terminated_length": 4333.453125, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.46303874254226685, + "epoch": 0.0390984360625575, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019385392079129815, + "learning_rate": 1e-05, + "loss": -0.0056, + "num_tokens": 29637331.0, + "reward": 0.515625, + "reward_std": 0.31512534618377686, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999468326568604, + "sampling/importance_sampling_ratio/min": 0.10085343569517136, + "sampling/sampling_logp_difference/max": 2.2940869331359863, + "sampling/sampling_logp_difference/mean": 0.017312370240688324, + "step": 85 + }, + { + "clip_ratio/high_max": 9.848577337834286e-05, + "clip_ratio/high_mean": 2.7283510007691802e-05, + "clip_ratio/low_mean": 0.00015025084576336667, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00017753436122802668, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11260.0, + "completions/mean_length": 4033.9375, + "completions/mean_terminated_length": 3837.905029296875, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 0.41759752854704857, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0066805374808609486, + "learning_rate": 1e-05, + "loss": 0.1099, + "num_tokens": 29906119.0, + "reward": 0.671875, + "reward_std": 0.29355230927467346, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000640153884888, + "sampling/importance_sampling_ratio/min": 0.2963661849498749, + "sampling/sampling_logp_difference/max": 1.216159462928772, + "sampling/sampling_logp_difference/mean": 0.014013087376952171, + "step": 86 + }, + { + "clip_ratio/high_max": 0.00014968111190682976, + "clip_ratio/high_mean": 6.019531133460987e-05, + "clip_ratio/low_mean": 0.0001971712508748169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00025736656061781105, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12530.0, + "completions/max_terminated_length": 12530.0, + "completions/mean_length": 4039.625, + "completions/mean_terminated_length": 4039.625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.4908015578985214, + "epoch": 0.04001839926402944, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0031810218933969736, + "learning_rate": 1e-05, + "loss": -0.0429, + "num_tokens": 30175247.0, + "reward": 0.6875, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.467176616191864, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998473525047302, + "sampling/importance_sampling_ratio/min": 0.12201575934886932, + "sampling/sampling_logp_difference/max": 2.103605031967163, + "sampling/sampling_logp_difference/mean": 0.014932084828615189, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0001242801772605162, + "clip_ratio/high_mean": 3.107004431512905e-05, + "clip_ratio/low_mean": 0.00024339640594917, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00027446644844530965, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14343.0, + "completions/max_terminated_length": 14343.0, + "completions/mean_length": 3642.65625, + "completions/mean_terminated_length": 3642.65625, + "completions/min_length": 
405.0, + "completions/min_terminated_length": 405.0, + "entropy": 0.497805830091238, + "epoch": 0.040478380864765406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.007662316784262657, + "learning_rate": 1e-05, + "loss": 0.0203, + "num_tokens": 30417265.0, + "reward": 0.609375, + "reward_std": 0.26196980476379395, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997639060020447, + "sampling/importance_sampling_ratio/min": 0.11714459955692291, + "sampling/sampling_logp_difference/max": 2.144346237182617, + "sampling/sampling_logp_difference/mean": 0.018438715487718582, + "step": 88 + }, + { + "clip_ratio/high_max": 0.00012720484210149152, + "clip_ratio/high_mean": 5.5490041631856e-05, + "clip_ratio/low_mean": 0.0003174601497448748, + "clip_ratio/low_min": 1.4323364666779526e-05, + "clip_ratio/region_mean": 0.0003729501986526884, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16234.0, + "completions/mean_length": 7055.640625, + "completions/mean_terminated_length": 6596.86865234375, + "completions/min_length": 1407.0, + "completions/min_terminated_length": 1407.0, + "entropy": 0.4791577495634556, + "epoch": 0.04093836246550138, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0010236409725621343, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 30879706.0, + "reward": 0.5, + "reward_std": 0.3230288028717041, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.1733396202325821, + "sampling/sampling_logp_difference/max": 1.75250244140625, + "sampling/sampling_logp_difference/mean": 0.017151571810245514, + "step": 89 + }, + { + 
"clip_ratio/high_max": 0.00010716462838900043, + "clip_ratio/high_mean": 3.53956390881649e-05, + "clip_ratio/low_mean": 0.0002695056762149761, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00030490130939142546, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16087.0, + "completions/mean_length": 6766.734375, + "completions/mean_terminated_length": 6125.58349609375, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "entropy": 0.5393588915467262, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016526335384696722, + "learning_rate": 1e-05, + "loss": 0.0128, + "num_tokens": 31324809.0, + "reward": 0.375, + "reward_std": 0.342454731464386, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000569820404053, + "sampling/importance_sampling_ratio/min": 0.009934165515005589, + "sampling/sampling_logp_difference/max": 4.6117753982543945, + "sampling/sampling_logp_difference/mean": 0.018986130133271217, + "step": 90 + }, + { + "clip_ratio/high_max": 7.089101882229443e-05, + "clip_ratio/high_mean": 2.2431363390751358e-05, + "clip_ratio/low_mean": 0.00013420935329122585, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00015664071861465345, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15215.0, + "completions/mean_length": 6582.734375, + "completions/mean_terminated_length": 5379.0703125, + "completions/min_length": 750.0, + "completions/min_terminated_length": 750.0, + "entropy": 0.5493632070720196, + "epoch": 0.04185832566697332, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0012266625417396426, + "learning_rate": 1e-05, + "loss": -0.0121, + "num_tokens": 31754640.0, + "reward": 0.515625, + "reward_std": 
0.3266732692718506, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999552965164185, + "sampling/importance_sampling_ratio/min": 0.026827236637473106, + "sampling/sampling_logp_difference/max": 3.618337631225586, + "sampling/sampling_logp_difference/mean": 0.01922820881009102, + "step": 91 + }, + { + "clip_ratio/high_max": 0.00015220932891679695, + "clip_ratio/high_mean": 5.230503052189306e-05, + "clip_ratio/low_mean": 0.00028057711733708857, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003328821453578712, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14355.0, + "completions/max_terminated_length": 14355.0, + "completions/mean_length": 4673.609375, + "completions/mean_terminated_length": 4673.609375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.37891076132655144, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0038514919579029083, + "learning_rate": 1e-05, + "loss": -0.0397, + "num_tokens": 32063375.0, + "reward": 0.671875, + "reward_std": 0.3908922076225281, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999393224716187, + "sampling/importance_sampling_ratio/min": 0.20188941061496735, + "sampling/sampling_logp_difference/max": 1.6000351905822754, + "sampling/sampling_logp_difference/mean": 0.014766812324523926, + "step": 92 + }, + { + "clip_ratio/high_max": 0.00011865788155773771, + "clip_ratio/high_mean": 4.490372168675094e-05, + "clip_ratio/low_mean": 0.00023933520606078673, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00028423893309081905, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10753.0, + 
"completions/max_terminated_length": 10753.0, + "completions/mean_length": 4133.34375, + "completions/mean_terminated_length": 4133.34375, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "entropy": 0.5132806189358234, + "epoch": 0.042778288868445265, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014511283952742815, + "learning_rate": 1e-05, + "loss": 0.016, + "num_tokens": 32347861.0, + "reward": 0.40625, + "reward_std": 0.3061639666557312, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999454021453857, + "sampling/importance_sampling_ratio/min": 0.2521326541900635, + "sampling/sampling_logp_difference/max": 1.3777999877929688, + "sampling/sampling_logp_difference/mean": 0.017015758901834488, + "step": 93 + }, + { + "clip_ratio/high_max": 0.00018065326321448083, + "clip_ratio/high_mean": 6.95563029466939e-05, + "clip_ratio/low_mean": 0.00028433307852537837, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003538893797667697, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15960.0, + "completions/mean_length": 6026.265625, + "completions/mean_terminated_length": 5516.86865234375, + "completions/min_length": 873.0, + "completions/min_terminated_length": 873.0, + "entropy": 0.42576640471816063, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002165265381336212, + "learning_rate": 1e-05, + "loss": 0.0424, + "num_tokens": 32742870.0, + "reward": 0.609375, + "reward_std": 0.4523906111717224, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000587701797485, + "sampling/importance_sampling_ratio/min": 0.011113330721855164, + 
"sampling/sampling_logp_difference/max": 4.49960994720459, + "sampling/sampling_logp_difference/mean": 0.01895231008529663, + "step": 94 + }, + { + "clip_ratio/high_max": 7.521962379541947e-05, + "clip_ratio/high_mean": 2.5767211354832398e-05, + "clip_ratio/low_mean": 0.0003009975771419704, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00032676478986104485, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15803.0, + "completions/mean_length": 6262.4375, + "completions/mean_terminated_length": 5019.4384765625, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.3823938798159361, + "epoch": 0.043698252069917204, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017652231035754085, + "learning_rate": 1e-05, + "loss": -0.0391, + "num_tokens": 33152578.0, + "reward": 0.640625, + "reward_std": 0.31512534618377686, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000849962234497, + "sampling/importance_sampling_ratio/min": 0.08265355229377747, + "sampling/sampling_logp_difference/max": 2.4930975437164307, + "sampling/sampling_logp_difference/mean": 0.015054848976433277, + "step": 95 + }, + { + "clip_ratio/high_max": 0.00019391980549698928, + "clip_ratio/high_mean": 5.29239216575661e-05, + "clip_ratio/low_mean": 0.00014883351195749128, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002017574342971784, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13824.0, + "completions/max_terminated_length": 13824.0, + "completions/mean_length": 3582.4375, + "completions/mean_terminated_length": 3582.4375, + "completions/min_length": 787.0, + "completions/min_terminated_length": 787.0, + "entropy": 0.42068246752023697, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.25, + 
"grad_norm": 0.001477578654885292, + "learning_rate": 1e-05, + "loss": 0.1755, + "num_tokens": 33390510.0, + "reward": 0.796875, + "reward_std": 0.3403330445289612, + "rewards/accuracy_reward/mean": 0.796875, + "rewards/accuracy_reward/std": 0.40550529956817627, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000420808792114, + "sampling/importance_sampling_ratio/min": 0.20339767634868622, + "sampling/sampling_logp_difference/max": 1.5925922393798828, + "sampling/sampling_logp_difference/mean": 0.013980602845549583, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0001818037626435398, + "clip_ratio/high_mean": 5.415482519310899e-05, + "clip_ratio/low_mean": 0.00012345622963039204, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00017761105391400633, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15258.0, + "completions/mean_length": 4848.875, + "completions/mean_terminated_length": 4476.77392578125, + "completions/min_length": 951.0, + "completions/min_terminated_length": 951.0, + "entropy": 0.3348248451948166, + "epoch": 0.04461821527138914, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017093514325097203, + "learning_rate": 1e-05, + "loss": 0.119, + "num_tokens": 33711878.0, + "reward": 0.703125, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.4604927599430084, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999937117099762, + "sampling/importance_sampling_ratio/min": 0.08507421612739563, + "sampling/sampling_logp_difference/max": 2.464231252670288, + "sampling/sampling_logp_difference/mean": 0.013996141962707043, + "step": 97 + }, + { + "clip_ratio/high_max": 8.166239786078222e-05, + "clip_ratio/high_mean": 3.0598509965784615e-05, + "clip_ratio/low_mean": 0.0001227793386533449, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.00015337784543589805, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12150.0, + "completions/max_terminated_length": 12150.0, + "completions/mean_length": 3608.53125, + "completions/mean_terminated_length": 3608.53125, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.4186965227127075, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0035578685346990824, + "learning_rate": 1e-05, + "loss": -0.02, + "num_tokens": 33952336.0, + "reward": 0.734375, + "reward_std": 0.41398805379867554, + "rewards/accuracy_reward/mean": 0.734375, + "rewards/accuracy_reward/std": 0.44515693187713623, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000457763671875, + "sampling/importance_sampling_ratio/min": 0.21771050989627838, + "sampling/sampling_logp_difference/max": 1.8422369956970215, + "sampling/sampling_logp_difference/mean": 0.015421013347804546, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0002565569302532822, + "clip_ratio/high_mean": 7.735242525086505e-05, + "clip_ratio/low_mean": 0.00022900168551132083, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003063541153096594, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16100.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 5686.828125, + "completions/mean_terminated_length": 5686.828125, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 0.3955523520708084, + "epoch": 0.04553817847286108, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0015768167795613408, + "learning_rate": 1e-05, + "loss": -0.0626, + "num_tokens": 34325829.0, + "reward": 0.5, + "reward_std": 0.3682710528373718, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000638961791992, + "sampling/importance_sampling_ratio/min": 0.1319275200366974, + "sampling/sampling_logp_difference/max": 2.0255026817321777, + "sampling/sampling_logp_difference/mean": 0.01693328656256199, + "step": 99 + }, + { + "clip_ratio/high_max": 0.00017156526701000985, + "clip_ratio/high_mean": 4.4765379698219476e-05, + "clip_ratio/low_mean": 0.00013393372819336946, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00017869910379886278, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13278.0, + "completions/max_terminated_length": 13278.0, + "completions/mean_length": 4955.796875, + "completions/mean_terminated_length": 4955.796875, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "entropy": 0.41905970498919487, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016299523413181305, + "learning_rate": 1e-05, + "loss": -0.0648, + "num_tokens": 34650848.0, + "reward": 0.578125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999072551727295, + "sampling/importance_sampling_ratio/min": 0.27319714426994324, + "sampling/sampling_logp_difference/max": 1.2975616455078125, + "sampling/sampling_logp_difference/mean": 0.016213715076446533, + "step": 100 + }, + { + "clip_ratio/high_max": 0.000201842942260555, + "clip_ratio/high_mean": 8.582275131630013e-05, + "clip_ratio/low_mean": 4.658012494473951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013240287626103964, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7215.21875, + "completions/mean_terminated_length": 7069.68310546875, + "completions/min_length": 1205.0, + 
"completions/min_terminated_length": 1205.0, + "entropy": 0.41243599355220795, + "epoch": 0.04645814167433303, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003915950655937195, + "learning_rate": 1e-05, + "loss": 0.0245, + "num_tokens": 35123918.0, + "reward": 0.65625, + "reward_std": 0.2346404492855072, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000272989273071, + "sampling/importance_sampling_ratio/min": 6.35706098872646e-10, + "sampling/sampling_logp_difference/max": 21.176284790039062, + "sampling/sampling_logp_difference/mean": 0.01628049463033676, + "step": 101 + }, + { + "clip_ratio/high_max": 0.00018997467668668833, + "clip_ratio/high_mean": 7.346466600210988e-05, + "clip_ratio/low_mean": 0.00024571850167376397, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00031918316017254256, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14416.0, + "completions/mean_length": 4967.4375, + "completions/mean_terminated_length": 4599.1611328125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.43091630935668945, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0014639816945418715, + "learning_rate": 1e-05, + "loss": 0.0304, + "num_tokens": 35454658.0, + "reward": 0.5, + "reward_std": 0.3777071237564087, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000555515289307, + "sampling/importance_sampling_ratio/min": 0.2935287654399872, + "sampling/sampling_logp_difference/max": 1.4500041007995605, + "sampling/sampling_logp_difference/mean": 0.016430124640464783, + "step": 102 + }, + { + "clip_ratio/high_max": 
0.00027447581123851705, + "clip_ratio/high_mean": 7.783462342558778e-05, + "clip_ratio/low_mean": 0.00025762664154171944, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00033546126724104397, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14372.0, + "completions/max_terminated_length": 14372.0, + "completions/mean_length": 4758.21875, + "completions/mean_terminated_length": 4758.21875, + "completions/min_length": 1026.0, + "completions/min_terminated_length": 1026.0, + "entropy": 0.5072713866829872, + "epoch": 0.04737810487580497, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0027890228666365147, + "learning_rate": 1e-05, + "loss": -0.002, + "num_tokens": 35771336.0, + "reward": 0.53125, + "reward_std": 0.3029785752296448, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000927448272705, + "sampling/importance_sampling_ratio/min": 0.13961824774742126, + "sampling/sampling_logp_difference/max": 1.9688433408737183, + "sampling/sampling_logp_difference/mean": 0.017496878281235695, + "step": 103 + }, + { + "clip_ratio/high_max": 0.00013204321567172883, + "clip_ratio/high_mean": 3.5426355907475227e-05, + "clip_ratio/low_mean": 0.00023678694105910836, + "clip_ratio/low_min": 3.282993930042721e-05, + "clip_ratio/region_mean": 0.0002722132940107258, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13356.0, + "completions/max_terminated_length": 13356.0, + "completions/mean_length": 4473.171875, + "completions/mean_terminated_length": 4473.171875, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "entropy": 0.5951492674648762, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004909890703856945, + "learning_rate": 1e-05, + "loss": 0.0509, + "num_tokens": 36068339.0, + "reward": 0.515625, + "reward_std": 0.3492845892906189, 
+ "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999641180038452, + "sampling/importance_sampling_ratio/min": 0.18714448809623718, + "sampling/sampling_logp_difference/max": 1.6758742332458496, + "sampling/sampling_logp_difference/mean": 0.01959427446126938, + "step": 104 + }, + { + "clip_ratio/high_max": 0.00016260610209428705, + "clip_ratio/high_mean": 5.445963370220852e-05, + "clip_ratio/low_mean": 0.00027578835397434887, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00033024798904079944, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15313.0, + "completions/mean_length": 5181.5625, + "completions/mean_terminated_length": 4630.62255859375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.47477902472019196, + "epoch": 0.04829806807727691, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025038071908056736, + "learning_rate": 1e-05, + "loss": 0.0722, + "num_tokens": 36409575.0, + "reward": 0.453125, + "reward_std": 0.36507582664489746, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000419616699219, + "sampling/importance_sampling_ratio/min": 0.01011443231254816, + "sampling/sampling_logp_difference/max": 4.593791961669922, + "sampling/sampling_logp_difference/mean": 0.017458593472838402, + "step": 105 + }, + { + "clip_ratio/high_max": 8.053758392634336e-05, + "clip_ratio/high_mean": 3.110795205429895e-05, + "clip_ratio/low_mean": 0.0005240299615252297, + "clip_ratio/low_min": 6.53458118904382e-06, + "clip_ratio/region_mean": 0.0005551379072130658, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16081.0, + "completions/mean_length": 8749.125, + "completions/mean_terminated_length": 7335.25927734375, + "completions/min_length": 997.0, + "completions/min_terminated_length": 997.0, + "entropy": 0.586535070091486, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003648218931630254, + "learning_rate": 1e-05, + "loss": 0.0214, + "num_tokens": 36979055.0, + "reward": 0.421875, + "reward_std": 0.2993341088294983, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999933242797852, + "sampling/importance_sampling_ratio/min": 0.0010058816988021135, + "sampling/sampling_logp_difference/max": 6.901890754699707, + "sampling/sampling_logp_difference/mean": 0.023093216121196747, + "step": 106 + }, + { + "clip_ratio/high_max": 0.00013844405839336105, + "clip_ratio/high_mean": 4.615002399077639e-05, + "clip_ratio/low_mean": 0.0001350231077594799, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018117312947651953, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15589.0, + "completions/mean_length": 6056.921875, + "completions/mean_terminated_length": 5368.4501953125, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "entropy": 0.4401419050991535, + "epoch": 0.04921803127874885, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0014414339093491435, + "learning_rate": 1e-05, + "loss": -0.0276, + "num_tokens": 37376482.0, + "reward": 0.34375, + "reward_std": 0.3072218894958496, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000091314315796, + "sampling/importance_sampling_ratio/min": 0.14452841877937317, + 
"sampling/sampling_logp_difference/max": 1.934279203414917, + "sampling/sampling_logp_difference/mean": 0.017904866486787796, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0001955404550244566, + "clip_ratio/high_mean": 7.653925149497809e-05, + "clip_ratio/low_mean": 0.0002893621494877152, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003659013982542092, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13655.0, + "completions/mean_length": 4545.0, + "completions/mean_terminated_length": 3541.69482421875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 0.372543640434742, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0034049772657454014, + "learning_rate": 1e-05, + "loss": 0.1294, + "num_tokens": 37675834.0, + "reward": 0.6875, + "reward_std": 0.39347875118255615, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.467176616191864, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000131130218506, + "sampling/importance_sampling_ratio/min": 3.0434759537456557e-05, + "sampling/sampling_logp_difference/max": 10.399925231933594, + "sampling/sampling_logp_difference/mean": 0.014691833406686783, + "step": 108 + }, + { + "clip_ratio/high_max": 0.00012398830403981265, + "clip_ratio/high_mean": 3.488012771413196e-05, + "clip_ratio/low_mean": 0.00017011856152748805, + "clip_ratio/low_min": 7.710813406447414e-06, + "clip_ratio/region_mean": 0.00020499869060586207, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16184.0, + "completions/mean_length": 5124.203125, + "completions/mean_terminated_length": 4945.4765625, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.40799567475914955, + "epoch": 0.05013799448022079, + 
"frac_reward_zero_std": 0.25, + "grad_norm": 0.0021821665577590466, + "learning_rate": 1e-05, + "loss": 0.0248, + "num_tokens": 38013495.0, + "reward": 0.515625, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999369978904724, + "sampling/importance_sampling_ratio/min": 0.04143543541431427, + "sampling/sampling_logp_difference/max": 3.1836187839508057, + "sampling/sampling_logp_difference/mean": 0.015723641961812973, + "step": 109 + }, + { + "clip_ratio/high_max": 0.00011402473955968162, + "clip_ratio/high_mean": 2.8506184889920405e-05, + "clip_ratio/low_mean": 0.00014105440459388774, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016956058880168712, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10540.0, + "completions/mean_length": 3167.21875, + "completions/mean_terminated_length": 2957.4287109375, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.4380917586386204, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.003855443326756358, + "learning_rate": 1e-05, + "loss": 0.1642, + "num_tokens": 38225933.0, + "reward": 0.640625, + "reward_std": 0.48551157116889954, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000114440917969, + "sampling/importance_sampling_ratio/min": 0.07347333431243896, + "sampling/sampling_logp_difference/max": 2.610832691192627, + "sampling/sampling_logp_difference/mean": 0.014714892953634262, + "step": 110 + }, + { + "clip_ratio/high_max": 0.00015438527952937875, + "clip_ratio/high_mean": 4.432886225913535e-05, + "clip_ratio/low_mean": 0.0001313946268055588, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001757234877004521, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15641.0, + "completions/max_terminated_length": 15641.0, + "completions/mean_length": 3063.59375, + "completions/mean_terminated_length": 3063.59375, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 0.3451516814529896, + "epoch": 0.05105795768169273, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0017416629707440734, + "learning_rate": 1e-05, + "loss": 0.0888, + "num_tokens": 38429819.0, + "reward": 0.5625, + "reward_std": 0.4139782190322876, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999927282333374, + "sampling/importance_sampling_ratio/min": 0.13355965912342072, + "sampling/sampling_logp_difference/max": 2.013206958770752, + "sampling/sampling_logp_difference/mean": 0.01309503149241209, + "step": 111 + }, + { + "clip_ratio/high_max": 0.00011567322871997021, + "clip_ratio/high_mean": 4.967931909050094e-05, + "clip_ratio/low_mean": 0.00022050612915336387, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002701854518818436, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 8586.265625, + "completions/mean_terminated_length": 7784.27587890625, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "entropy": 0.4284644089639187, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018239191267639399, + "learning_rate": 1e-05, + "loss": 0.1554, + "num_tokens": 38995164.0, + "reward": 0.5625, + "reward_std": 0.3924052119255066, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999877393245697, + "sampling/importance_sampling_ratio/min": 0.008349776268005371, + "sampling/sampling_logp_difference/max": 4.785520553588867, + "sampling/sampling_logp_difference/mean": 0.018177181482315063, + "step": 112 + }, + { + "clip_ratio/high_max": 0.00010434241585244308, + "clip_ratio/high_mean": 3.0345908612616768e-05, + "clip_ratio/low_mean": 0.0002064375662484963, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000236783477703284, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16191.0, + "completions/mean_length": 5793.140625, + "completions/mean_terminated_length": 5625.0322265625, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.4497801251709461, + "epoch": 0.05197792088316467, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024066369514912367, + "learning_rate": 1e-05, + "loss": 0.0793, + "num_tokens": 39376541.0, + "reward": 0.515625, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000128746032715, + "sampling/importance_sampling_ratio/min": 0.2231399565935135, + "sampling/sampling_logp_difference/max": 1.4999561309814453, + "sampling/sampling_logp_difference/mean": 0.01722925715148449, + "step": 113 + }, + { + "clip_ratio/high_max": 0.000142209177283803, + "clip_ratio/high_mean": 3.741042246474535e-05, + "clip_ratio/low_mean": 8.439288603767636e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012180330850242171, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 6118.15625, + "completions/mean_terminated_length": 5955.20654296875, + "completions/min_length": 202.0, + 
"completions/min_terminated_length": 202.0, + "entropy": 0.4755205847322941, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018325611017644405, + "learning_rate": 1e-05, + "loss": -0.0845, + "num_tokens": 39776703.0, + "reward": 0.46875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999382495880127, + "sampling/importance_sampling_ratio/min": 0.11108703166246414, + "sampling/sampling_logp_difference/max": 2.197441339492798, + "sampling/sampling_logp_difference/mean": 0.016467180103063583, + "step": 114 + }, + { + "clip_ratio/high_max": 0.00013539696647058008, + "clip_ratio/high_mean": 4.4776959612136125e-05, + "clip_ratio/low_mean": 0.0001621112608063413, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020688822041847743, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13185.0, + "completions/max_terminated_length": 13185.0, + "completions/mean_length": 4067.65625, + "completions/mean_terminated_length": 4067.65625, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.4728252850472927, + "epoch": 0.052897884084636616, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0028504086658358574, + "learning_rate": 1e-05, + "loss": -0.0113, + "num_tokens": 40046041.0, + "reward": 0.578125, + "reward_std": 0.4050365090370178, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000386238098145, + "sampling/importance_sampling_ratio/min": 0.2996428310871124, + "sampling/sampling_logp_difference/max": 1.2051640748977661, + "sampling/sampling_logp_difference/mean": 0.01653115823864937, + "step": 115 + }, + { + "clip_ratio/high_max": 
0.00016086296363937436, + "clip_ratio/high_mean": 5.132767250870529e-05, + "clip_ratio/low_mean": 8.466833241982386e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013599600742963958, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9704.0, + "completions/max_terminated_length": 9704.0, + "completions/mean_length": 3261.0, + "completions/mean_terminated_length": 3261.0, + "completions/min_length": 745.0, + "completions/min_terminated_length": 745.0, + "entropy": 0.5103091672062874, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004390237387269735, + "learning_rate": 1e-05, + "loss": -0.0547, + "num_tokens": 40263945.0, + "reward": 0.6875, + "reward_std": 0.34034284949302673, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.467176616191864, + "sampling/importance_sampling_ratio/max": 1.7932543754577637, + "sampling/importance_sampling_ratio/mean": 0.9998769760131836, + "sampling/importance_sampling_ratio/min": 0.0008851143647916615, + "sampling/sampling_logp_difference/max": 7.029793739318848, + "sampling/sampling_logp_difference/mean": 0.017080796882510185, + "step": 116 + }, + { + "clip_ratio/high_max": 6.713680249958998e-05, + "clip_ratio/high_mean": 1.6784200624897494e-05, + "clip_ratio/low_mean": 0.00023034057926452078, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00024712477647881315, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13991.0, + "completions/mean_length": 5899.171875, + "completions/mean_terminated_length": 5200.18359375, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "entropy": 0.5374041832983494, + "epoch": 0.053817847286108556, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0015930214431136847, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 40650964.0, + "reward": 0.578125, + "reward_std": 0.1530819982290268, + 
"rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997994303703308, + "sampling/importance_sampling_ratio/min": 0.25812914967536926, + "sampling/sampling_logp_difference/max": 1.354295253753662, + "sampling/sampling_logp_difference/mean": 0.020320815965533257, + "step": 117 + }, + { + "clip_ratio/high_max": 0.00013986208477945183, + "clip_ratio/high_mean": 4.5305262233341637e-05, + "clip_ratio/low_mean": 0.00014710804316564463, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019241330755903618, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12004.0, + "completions/max_terminated_length": 12004.0, + "completions/mean_length": 3367.328125, + "completions/mean_terminated_length": 3367.328125, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.4309644438326359, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029529735911637545, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 40874649.0, + "reward": 0.65625, + "reward_std": 0.4050266742706299, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999914824962616, + "sampling/importance_sampling_ratio/min": 0.5185436010360718, + "sampling/sampling_logp_difference/max": 0.8470115661621094, + "sampling/sampling_logp_difference/mean": 0.014474974945187569, + "step": 118 + }, + { + "clip_ratio/high_max": 0.00012979303846805124, + "clip_ratio/high_mean": 4.9982098062173463e-05, + "clip_ratio/low_mean": 0.00030595043153880397, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003559325195965357, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15999.0, + 
"completions/mean_length": 6069.640625, + "completions/mean_terminated_length": 5905.9208984375, + "completions/min_length": 1211.0, + "completions/min_terminated_length": 1211.0, + "entropy": 0.569359052926302, + "epoch": 0.054737810487580495, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.002493771258741617, + "learning_rate": 1e-05, + "loss": -0.0114, + "num_tokens": 41272114.0, + "reward": 0.46875, + "reward_std": 0.4082317352294922, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000840425491333, + "sampling/importance_sampling_ratio/min": 0.00010690245835576206, + "sampling/sampling_logp_difference/max": 9.143593788146973, + "sampling/sampling_logp_difference/mean": 0.019345756620168686, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0002575382823124528, + "clip_ratio/high_mean": 8.639247698738473e-05, + "clip_ratio/low_mean": 0.00022337435802910477, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003097668359259842, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14922.0, + "completions/mean_length": 4214.5625, + "completions/mean_terminated_length": 3616.0654296875, + "completions/min_length": 815.0, + "completions/min_terminated_length": 815.0, + "entropy": 0.37961139529943466, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002614647848531604, + "learning_rate": 1e-05, + "loss": 0.1154, + "num_tokens": 41550902.0, + "reward": 0.765625, + "reward_std": 0.31512534618377686, + "rewards/accuracy_reward/mean": 0.765625, + "rewards/accuracy_reward/std": 0.42695629596710205, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.27158376574516296, + "sampling/sampling_logp_difference/max": 
1.3034846782684326, + "sampling/sampling_logp_difference/mean": 0.014523299410939217, + "step": 120 + }, + { + "clip_ratio/high_max": 0.00022899942905496573, + "clip_ratio/high_mean": 7.227375863294583e-05, + "clip_ratio/low_mean": 0.0001765698939379945, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00024884364665922476, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10759.0, + "completions/mean_length": 3585.65625, + "completions/mean_terminated_length": 3382.508056640625, + "completions/min_length": 578.0, + "completions/min_terminated_length": 578.0, + "entropy": 0.3841286860406399, + "epoch": 0.05565777368905244, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002633139258250594, + "learning_rate": 1e-05, + "loss": 0.0646, + "num_tokens": 41788800.0, + "reward": 0.5, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000247955322266, + "sampling/importance_sampling_ratio/min": 0.03442913666367531, + "sampling/sampling_logp_difference/max": 3.368852138519287, + "sampling/sampling_logp_difference/mean": 0.014772026799619198, + "step": 121 + }, + { + "clip_ratio/high_max": 0.00011599275330809178, + "clip_ratio/high_mean": 4.372763510218647e-05, + "clip_ratio/low_mean": 0.00016362589440177544, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020735352973133558, + "completions/clipped_ratio": 0.125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 6827.625, + "completions/mean_terminated_length": 5462.4287109375, + "completions/min_length": 1178.0, + "completions/min_terminated_length": 1178.0, + "entropy": 0.4236124977469444, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022398591972887516, + 
"learning_rate": 1e-05, + "loss": 0.0504, + "num_tokens": 42236600.0, + "reward": 0.625, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999827146530151, + "sampling/importance_sampling_ratio/min": 0.0008994160452857614, + "sampling/sampling_logp_difference/max": 7.01376485824585, + "sampling/sampling_logp_difference/mean": 0.015439807437360287, + "step": 122 + }, + { + "clip_ratio/high_max": 5.419486842583865e-05, + "clip_ratio/high_mean": 2.424228341624257e-05, + "clip_ratio/low_mean": 0.0003505960376060102, + "clip_ratio/low_min": 6.290438614087179e-05, + "clip_ratio/region_mean": 0.00037483832056750543, + "completions/clipped_ratio": 0.125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15703.0, + "completions/mean_length": 7439.390625, + "completions/mean_terminated_length": 6161.58935546875, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.42251385003328323, + "epoch": 0.05657773689052438, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013182272668927908, + "learning_rate": 1e-05, + "loss": 0.0661, + "num_tokens": 42722049.0, + "reward": 0.53125, + "reward_std": 0.28247910737991333, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000464916229248, + "sampling/importance_sampling_ratio/min": 0.009381524287164211, + "sampling/sampling_logp_difference/max": 4.669013023376465, + "sampling/sampling_logp_difference/mean": 0.017970317974686623, + "step": 123 + }, + { + "clip_ratio/high_max": 0.00022104287472757278, + "clip_ratio/high_mean": 8.386546346628165e-05, + "clip_ratio/low_mean": 0.00033902134964591824, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.0004228868028803845, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12644.0, + "completions/mean_length": 4582.703125, + "completions/mean_terminated_length": 4395.38134765625, + "completions/min_length": 710.0, + "completions/min_terminated_length": 710.0, + "entropy": 0.45571400970220566, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.003415483282878995, + "learning_rate": 1e-05, + "loss": 0.0003, + "num_tokens": 43024382.0, + "reward": 0.625, + "reward_std": 0.45134252309799194, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999266862869263, + "sampling/importance_sampling_ratio/min": 0.0020838617347180843, + "sampling/sampling_logp_difference/max": 6.173532485961914, + "sampling/sampling_logp_difference/mean": 0.017238060012459755, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0001726137379591819, + "clip_ratio/high_mean": 5.8308734878664836e-05, + "clip_ratio/low_mean": 0.0001304974630329525, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018880619791161735, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12312.0, + "completions/mean_length": 3796.65625, + "completions/mean_terminated_length": 3177.6064453125, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.36019331216812134, + "epoch": 0.05749770009199632, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004071601200848818, + "learning_rate": 1e-05, + "loss": -0.0787, + "num_tokens": 43277528.0, + "reward": 0.625, + "reward_std": 0.3787454068660736, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999937891960144, + "sampling/importance_sampling_ratio/min": 0.3052186071872711, + "sampling/sampling_logp_difference/max": 1.432037115097046, + "sampling/sampling_logp_difference/mean": 0.01319027692079544, + "step": 125 + }, + { + "clip_ratio/high_max": 7.133460348995868e-05, + "clip_ratio/high_mean": 2.1890245989197865e-05, + "clip_ratio/low_mean": 0.00012619525250556762, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001480855021327443, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 9682.125, + "completions/mean_terminated_length": 8135.53857421875, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "entropy": 0.45171455293893814, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0005967849283479154, + "learning_rate": 1e-05, + "loss": 0.0396, + "num_tokens": 43907504.0, + "reward": 0.40625, + "reward_std": 0.24511480331420898, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000497102737427, + "sampling/importance_sampling_ratio/min": 0.08479011803865433, + "sampling/sampling_logp_difference/max": 2.467576265335083, + "sampling/sampling_logp_difference/mean": 0.01833641156554222, + "step": 126 + }, + { + "clip_ratio/high_max": 0.00015240923221426783, + "clip_ratio/high_mean": 4.380486257105076e-05, + "clip_ratio/low_mean": 0.000209838211958413, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002536430743020901, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6274.828125, + "completions/mean_terminated_length": 5600.8837890625, + "completions/min_length": 1218.0, + 
"completions/min_terminated_length": 1218.0, + "entropy": 0.43740469962358475, + "epoch": 0.05841766329346826, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002006928203627467, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 44318477.0, + "reward": 0.609375, + "reward_std": 0.27883461117744446, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000008225440979, + "sampling/importance_sampling_ratio/min": 0.006873338017612696, + "sampling/sampling_logp_difference/max": 4.980105400085449, + "sampling/sampling_logp_difference/mean": 0.0172873605042696, + "step": 127 + }, + { + "clip_ratio/high_max": 3.4462957046343945e-05, + "clip_ratio/high_mean": 8.615739261585986e-06, + "clip_ratio/low_mean": 0.00021862963694729842, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00022724537666363176, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15911.0, + "completions/max_terminated_length": 15911.0, + "completions/mean_length": 4089.046875, + "completions/mean_terminated_length": 4089.046875, + "completions/min_length": 917.0, + "completions/min_terminated_length": 917.0, + "entropy": 0.44774849712848663, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00209969119168818, + "learning_rate": 1e-05, + "loss": -0.0304, + "num_tokens": 44589816.0, + "reward": 0.421875, + "reward_std": 0.34717273712158203, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000078678131104, + "sampling/importance_sampling_ratio/min": 0.29485389590263367, + "sampling/sampling_logp_difference/max": 1.2212753295898438, + "sampling/sampling_logp_difference/mean": 0.014491476118564606, + "step": 128 + }, + { + "clip_ratio/high_max": 
0.00016890352435439127, + "clip_ratio/high_mean": 5.8580551922204904e-05, + "clip_ratio/low_mean": 0.00029687383357668296, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00035545438731787726, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12514.0, + "completions/max_terminated_length": 12514.0, + "completions/mean_length": 4120.46875, + "completions/mean_terminated_length": 4120.46875, + "completions/min_length": 570.0, + "completions/min_terminated_length": 570.0, + "entropy": 0.37828731164336205, + "epoch": 0.059337626494940204, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.00247256294824183, + "learning_rate": 1e-05, + "loss": -0.0719, + "num_tokens": 44863870.0, + "reward": 0.640625, + "reward_std": 0.37298911809921265, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999027252197266, + "sampling/importance_sampling_ratio/min": 0.25366994738578796, + "sampling/sampling_logp_difference/max": 1.3717212677001953, + "sampling/sampling_logp_difference/mean": 0.01514413021504879, + "step": 129 + }, + { + "clip_ratio/high_max": 6.054695404600352e-05, + "clip_ratio/high_mean": 1.513673851150088e-05, + "clip_ratio/low_mean": 0.00011639616241154727, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013153290092304815, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16204.0, + "completions/mean_length": 4336.640625, + "completions/mean_terminated_length": 4145.4130859375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.5505912192165852, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002205714350566268, + "learning_rate": 1e-05, + "loss": 0.0071, + "num_tokens": 45150095.0, + "reward": 0.265625, + "reward_std": 0.3051002323627472, + 
"rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001391172409058, + "sampling/importance_sampling_ratio/min": 0.06648644804954529, + "sampling/sampling_logp_difference/max": 2.710757255554199, + "sampling/sampling_logp_difference/mean": 0.017366381362080574, + "step": 130 + }, + { + "clip_ratio/high_max": 0.00013182537441025488, + "clip_ratio/high_mean": 3.979877965321066e-05, + "clip_ratio/low_mean": 0.000278371120657539, + "clip_ratio/low_min": 2.8801843654946424e-05, + "clip_ratio/region_mean": 0.00031816989758226555, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15715.0, + "completions/mean_length": 5626.6875, + "completions/mean_terminated_length": 5279.67724609375, + "completions/min_length": 859.0, + "completions/min_terminated_length": 859.0, + "entropy": 0.4813901446759701, + "epoch": 0.060257589696412144, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0019324850291013718, + "learning_rate": 1e-05, + "loss": 0.0192, + "num_tokens": 45520363.0, + "reward": 0.59375, + "reward_std": 0.3956102430820465, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999158382415771, + "sampling/importance_sampling_ratio/min": 0.0007231447380036116, + "sampling/sampling_logp_difference/max": 7.231901168823242, + "sampling/sampling_logp_difference/mean": 0.01795651763677597, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0001917856679938268, + "clip_ratio/high_mean": 7.190962878667051e-05, + "clip_ratio/low_mean": 0.0003002988987645949, + "clip_ratio/low_min": 2.3995393348741345e-05, + "clip_ratio/region_mean": 0.0003722085129993502, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15459.0, + 
"completions/max_terminated_length": 15459.0, + "completions/mean_length": 5463.125, + "completions/mean_terminated_length": 5463.125, + "completions/min_length": 822.0, + "completions/min_terminated_length": 822.0, + "entropy": 0.48498839512467384, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0010704556480050087, + "learning_rate": 1e-05, + "loss": 0.0345, + "num_tokens": 45879459.0, + "reward": 0.5, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000200271606445, + "sampling/importance_sampling_ratio/min": 0.24045711755752563, + "sampling/sampling_logp_difference/max": 1.4252135753631592, + "sampling/sampling_logp_difference/mean": 0.01702497899532318, + "step": 132 + }, + { + "clip_ratio/high_max": 0.00018260821707372088, + "clip_ratio/high_mean": 8.817236493996461e-05, + "clip_ratio/low_mean": 0.00028106225181545597, + "clip_ratio/low_min": 1.3495277016772889e-05, + "clip_ratio/region_mean": 0.00036923462903359905, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15900.0, + "completions/mean_length": 7030.078125, + "completions/mean_terminated_length": 6728.33837890625, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "entropy": 0.5583953745663166, + "epoch": 0.06117755289788408, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004807902034372091, + "learning_rate": 1e-05, + "loss": 0.0251, + "num_tokens": 46339208.0, + "reward": 0.4375, + "reward_std": 0.34352827072143555, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000038146972656, + "sampling/importance_sampling_ratio/min": 0.16341879963874817, + 
"sampling/sampling_logp_difference/max": 1.811439037322998, + "sampling/sampling_logp_difference/mean": 0.02068626880645752, + "step": 133 + }, + { + "clip_ratio/high_max": 6.616325117647648e-05, + "clip_ratio/high_mean": 1.654081279411912e-05, + "clip_ratio/low_mean": 0.00023565934952785028, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002522001623219694, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10388.0, + "completions/mean_length": 3639.796875, + "completions/mean_terminated_length": 3228.693359375, + "completions/min_length": 650.0, + "completions/min_terminated_length": 650.0, + "entropy": 0.3612133227288723, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003295397385954857, + "learning_rate": 1e-05, + "loss": 0.0954, + "num_tokens": 46581867.0, + "reward": 0.53125, + "reward_std": 0.2756393849849701, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999962329864502, + "sampling/importance_sampling_ratio/min": 0.26906853914260864, + "sampling/sampling_logp_difference/max": 1.3127890825271606, + "sampling/sampling_logp_difference/mean": 0.013889246620237827, + "step": 134 + }, + { + "clip_ratio/high_max": 0.00010353518246120075, + "clip_ratio/high_mean": 2.7723654284272925e-05, + "clip_ratio/low_mean": 0.00025271423010053695, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00028043788643117296, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15826.0, + "completions/mean_length": 6138.40625, + "completions/mean_terminated_length": 5270.1357421875, + "completions/min_length": 948.0, + "completions/min_terminated_length": 948.0, + "entropy": 0.46735797077417374, + "epoch": 0.06209751609935603, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0006881815497763455, + "learning_rate": 1e-05, + "loss": -0.0115, + "num_tokens": 46984573.0, + "reward": 0.375, + "reward_std": 0.2619796395301819, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001217126846313, + "sampling/importance_sampling_ratio/min": 0.1895102858543396, + "sampling/sampling_logp_difference/max": 1.6633119583129883, + "sampling/sampling_logp_difference/mean": 0.018007703125476837, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0001601026592652488, + "clip_ratio/high_mean": 6.810909269461263e-05, + "clip_ratio/low_mean": 0.00029509376508940477, + "clip_ratio/low_min": 8.118738332996145e-05, + "clip_ratio/region_mean": 0.0003632028547144728, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6203.078125, + "completions/mean_terminated_length": 6041.4765625, + "completions/min_length": 994.0, + "completions/min_terminated_length": 994.0, + "entropy": 0.34924061596393585, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0030411158222705126, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 47390530.0, + "reward": 0.5625, + "reward_std": 0.45134252309799194, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000066757202148, + "sampling/importance_sampling_ratio/min": 0.0038473873864859343, + "sampling/sampling_logp_difference/max": 5.560360908508301, + "sampling/sampling_logp_difference/mean": 0.014004556462168694, + "step": 136 + }, + { + "clip_ratio/high_max": 0.00013779216351395007, + "clip_ratio/high_mean": 3.622300414463098e-05, + "clip_ratio/low_mean": 0.000246863734901126, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.0002830867379088886, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16327.0, + "completions/mean_length": 5275.09375, + "completions/mean_terminated_length": 4534.5, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "entropy": 0.48541659861803055, + "epoch": 0.06301747930082796, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019960978534072638, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 47738616.0, + "reward": 0.5625, + "reward_std": 0.3787454068660736, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998708367347717, + "sampling/importance_sampling_ratio/min": 0.25954926013946533, + "sampling/sampling_logp_difference/max": 2.035876750946045, + "sampling/sampling_logp_difference/mean": 0.01738009974360466, + "step": 137 + }, + { + "clip_ratio/high_max": 6.566441879840568e-05, + "clip_ratio/high_mean": 1.9573946701711975e-05, + "clip_ratio/low_mean": 0.00018548900698078796, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020506295550148934, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 6902.90625, + "completions/mean_terminated_length": 6270.83349609375, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "entropy": 0.5758580937981606, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018249191343784332, + "learning_rate": 1e-05, + "loss": 0.0649, + "num_tokens": 48190938.0, + "reward": 0.3125, + "reward_std": 0.22461533546447754, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
1.0000348091125488, + "sampling/importance_sampling_ratio/min": 0.0013268361799418926, + "sampling/sampling_logp_difference/max": 6.624958038330078, + "sampling/sampling_logp_difference/mean": 0.02041659690439701, + "step": 138 + }, + { + "clip_ratio/high_max": 0.00017541761735628825, + "clip_ratio/high_mean": 4.709997801910504e-05, + "clip_ratio/low_mean": 0.0001230241168741486, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001701240935290116, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16243.0, + "completions/mean_length": 6085.34375, + "completions/mean_terminated_length": 5578.85205078125, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.42954346910119057, + "epoch": 0.06393744250229991, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.000927706656511873, + "learning_rate": 1e-05, + "loss": -0.0193, + "num_tokens": 48589088.0, + "reward": 0.515625, + "reward_std": 0.17782479524612427, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999110102653503, + "sampling/importance_sampling_ratio/min": 0.015892520546913147, + "sampling/sampling_logp_difference/max": 4.14190673828125, + "sampling/sampling_logp_difference/mean": 0.016232255846261978, + "step": 139 + }, + { + "clip_ratio/high_max": 0.00010605377701722318, + "clip_ratio/high_mean": 2.6513444254305796e-05, + "clip_ratio/low_mean": 0.00017001426112983609, + "clip_ratio/low_min": 9.667440281191375e-06, + "clip_ratio/region_mean": 0.00019652770470202086, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11692.0, + "completions/mean_length": 4587.140625, + "completions/mean_terminated_length": 3800.68359375, + "completions/min_length": 650.0, + 
"completions/min_terminated_length": 650.0, + "entropy": 0.3784865029156208, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002462266944348812, + "learning_rate": 1e-05, + "loss": 0.1149, + "num_tokens": 48891993.0, + "reward": 0.75, + "reward_std": 0.3119301199913025, + "rewards/accuracy_reward/mean": 0.75, + "rewards/accuracy_reward/std": 0.4364357888698578, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999458193778992, + "sampling/importance_sampling_ratio/min": 0.04250956326723099, + "sampling/sampling_logp_difference/max": 3.1580262184143066, + "sampling/sampling_logp_difference/mean": 0.013811696320772171, + "step": 140 + }, + { + "clip_ratio/high_max": 2.936265809694305e-05, + "clip_ratio/high_mean": 7.340664524235763e-06, + "clip_ratio/low_mean": 4.854745839111274e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.2195410363347037e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14722.0, + "completions/mean_length": 5666.703125, + "completions/mean_terminated_length": 5139.62255859375, + "completions/min_length": 966.0, + "completions/min_terminated_length": 966.0, + "entropy": 0.39824797213077545, + "epoch": 0.06485740570377185, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0003115626168437302, + "learning_rate": 1e-05, + "loss": -0.0184, + "num_tokens": 49264062.0, + "reward": 0.5625, + "reward_std": 0.1462520956993103, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000395774841309, + "sampling/importance_sampling_ratio/min": 0.06243892386555672, + "sampling/sampling_logp_difference/max": 2.773566484451294, + "sampling/sampling_logp_difference/mean": 0.015739524737000465, + "step": 141 + }, + { + "clip_ratio/high_max": 0.000158108177856775, + 
"clip_ratio/high_mean": 5.2915278502041474e-05, + "clip_ratio/low_mean": 0.00016461382892885013, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021752910970462835, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15259.0, + "completions/mean_length": 6887.734375, + "completions/mean_terminated_length": 5721.5263671875, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "entropy": 0.38760824128985405, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0006745394202880561, + "learning_rate": 1e-05, + "loss": 0.0056, + "num_tokens": 49713061.0, + "reward": 0.65625, + "reward_std": 0.34034284949302673, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999780654907227, + "sampling/importance_sampling_ratio/min": 0.06032606586813927, + "sampling/sampling_logp_difference/max": 2.8079910278320312, + "sampling/sampling_logp_difference/mean": 0.01489229779690504, + "step": 142 + }, + { + "clip_ratio/high_max": 0.00015935591181914788, + "clip_ratio/high_mean": 6.387877647284768e-05, + "clip_ratio/low_mean": 0.0001730375179249677, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023691629667155212, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15544.0, + "completions/mean_length": 5005.8125, + "completions/mean_terminated_length": 4446.2294921875, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.4111749045550823, + "epoch": 0.06577736890524379, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002489981707185507, + "learning_rate": 1e-05, + "loss": 0.1439, + "num_tokens": 50041409.0, + "reward": 0.609375, + "reward_std": 0.38664886355400085, + 
"rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999486207962036, + "sampling/importance_sampling_ratio/min": 0.08146519958972931, + "sampling/sampling_logp_difference/max": 2.5075793266296387, + "sampling/sampling_logp_difference/mean": 0.015489751473069191, + "step": 143 + }, + { + "clip_ratio/high_max": 0.00025049392115761293, + "clip_ratio/high_mean": 8.004182222975942e-05, + "clip_ratio/low_mean": 0.00020983324338885723, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00028987506448174827, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16315.0, + "completions/max_terminated_length": 16315.0, + "completions/mean_length": 5179.40625, + "completions/mean_terminated_length": 5179.40625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.41279230639338493, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.002926065819337964, + "learning_rate": 1e-05, + "loss": 0.1158, + "num_tokens": 50382171.0, + "reward": 0.671875, + "reward_std": 0.40822193026542664, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999444484710693, + "sampling/importance_sampling_ratio/min": 0.25520530343055725, + "sampling/sampling_logp_difference/max": 1.6402530670166016, + "sampling/sampling_logp_difference/mean": 0.015069128945469856, + "step": 144 + }, + { + "clip_ratio/high_max": 0.00014712631036672974, + "clip_ratio/high_mean": 5.468455719892518e-05, + "clip_ratio/low_mean": 0.00023898459858173737, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002936691580543993, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10295.0, + 
"completions/mean_length": 4353.640625, + "completions/mean_terminated_length": 3965.564453125, + "completions/min_length": 554.0, + "completions/min_terminated_length": 554.0, + "entropy": 0.3548976257443428, + "epoch": 0.06669733210671573, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0013896445743739605, + "learning_rate": 1e-05, + "loss": 0.1011, + "num_tokens": 50669884.0, + "reward": 0.609375, + "reward_std": 0.4150616228580475, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000076174736023, + "sampling/importance_sampling_ratio/min": 0.3658331036567688, + "sampling/sampling_logp_difference/max": 1.0578279495239258, + "sampling/sampling_logp_difference/mean": 0.01400618627667427, + "step": 145 + }, + { + "clip_ratio/high_max": 0.00018550837739894632, + "clip_ratio/high_mean": 5.588274325418752e-05, + "clip_ratio/low_mean": 0.00013934067465015687, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019522341381161823, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11380.0, + "completions/max_terminated_length": 11380.0, + "completions/mean_length": 3818.765625, + "completions/mean_terminated_length": 3818.765625, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "entropy": 0.4553304873406887, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022751668002456427, + "learning_rate": 1e-05, + "loss": 0.0159, + "num_tokens": 50923221.0, + "reward": 0.59375, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999451637268066, + "sampling/importance_sampling_ratio/min": 0.4037473499774933, + "sampling/sampling_logp_difference/max": 0.906965970993042, + 
"sampling/sampling_logp_difference/mean": 0.017479849979281425, + "step": 146 + }, + { + "clip_ratio/high_max": 0.00012334420534898527, + "clip_ratio/high_mean": 3.775972527364502e-05, + "clip_ratio/low_mean": 0.00024554891206207685, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00028330863642622717, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15170.0, + "completions/mean_length": 4132.859375, + "completions/mean_terminated_length": 3938.39697265625, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "entropy": 0.3993053063750267, + "epoch": 0.06761729530818768, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003922725562006235, + "learning_rate": 1e-05, + "loss": 0.0842, + "num_tokens": 51197700.0, + "reward": 0.703125, + "reward_std": 0.3740273714065552, + "rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.4604927599430084, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002050399780273, + "sampling/importance_sampling_ratio/min": 0.09134452044963837, + "sampling/sampling_logp_difference/max": 2.3931169509887695, + "sampling/sampling_logp_difference/mean": 0.01436243113130331, + "step": 147 + }, + { + "clip_ratio/high_max": 6.435603609133977e-05, + "clip_ratio/high_mean": 1.806610680432641e-05, + "clip_ratio/low_mean": 0.0001659406625549309, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001840067711782467, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16096.0, + "completions/mean_length": 5267.796875, + "completions/mean_terminated_length": 4325.74560546875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.38123703747987747, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0020941398106515408, + 
"learning_rate": 1e-05, + "loss": -0.0609, + "num_tokens": 51543671.0, + "reward": 0.53125, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999316930770874, + "sampling/importance_sampling_ratio/min": 0.23037536442279816, + "sampling/sampling_logp_difference/max": 2.5325393676757812, + "sampling/sampling_logp_difference/mean": 0.014637207612395287, + "step": 148 + }, + { + "clip_ratio/high_max": 9.257747115043458e-05, + "clip_ratio/high_mean": 3.454523766777129e-05, + "clip_ratio/low_mean": 0.00017795059829950333, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021249583460303256, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 7131.109375, + "completions/mean_terminated_length": 5994.78955078125, + "completions/min_length": 618.0, + "completions/min_terminated_length": 618.0, + "entropy": 0.5104082711040974, + "epoch": 0.06853725850965961, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.00497112050652504, + "learning_rate": 1e-05, + "loss": 0.0182, + "num_tokens": 52009022.0, + "reward": 0.421875, + "reward_std": 0.2382849156856537, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000052571296692, + "sampling/importance_sampling_ratio/min": 0.1670650690793991, + "sampling/sampling_logp_difference/max": 2.297494888305664, + "sampling/sampling_logp_difference/mean": 0.01985531486570835, + "step": 149 + }, + { + "clip_ratio/high_max": 0.00023302506269828882, + "clip_ratio/high_mean": 6.607658588109189e-05, + "clip_ratio/low_mean": 0.0002972222391690593, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.00036329882095742505, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14592.0, + "completions/mean_length": 5244.09375, + "completions/mean_terminated_length": 4696.22900390625, + "completions/min_length": 979.0, + "completions/min_terminated_length": 979.0, + "entropy": 0.43432193621993065, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0020549402106553316, + "learning_rate": 1e-05, + "loss": 0.0465, + "num_tokens": 52355452.0, + "reward": 0.578125, + "reward_std": 0.32878512144088745, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999597668647766, + "sampling/importance_sampling_ratio/min": 0.12826821208000183, + "sampling/sampling_logp_difference/max": 2.0536317825317383, + "sampling/sampling_logp_difference/mean": 0.01682550646364689, + "step": 150 + }, + { + "clip_ratio/high_max": 9.257711553800618e-05, + "clip_ratio/high_mean": 2.4764625095485826e-05, + "clip_ratio/low_mean": 0.00017624517386138905, + "clip_ratio/low_min": 3.04195455100853e-05, + "clip_ratio/region_mean": 0.00020100980009374325, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13305.0, + "completions/mean_length": 4665.59375, + "completions/mean_terminated_length": 4287.58056640625, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "entropy": 0.4567765109241009, + "epoch": 0.06945722171113156, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019571157172322273, + "learning_rate": 1e-05, + "loss": 0.1202, + "num_tokens": 52661730.0, + "reward": 0.609375, + "reward_std": 0.3766237497329712, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0001718997955322, + "sampling/importance_sampling_ratio/min": 0.05863168090581894, + "sampling/sampling_logp_difference/max": 2.836480140686035, + "sampling/sampling_logp_difference/mean": 0.014873203821480274, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0001826390007408918, + "clip_ratio/high_mean": 4.919741900266672e-05, + "clip_ratio/low_mean": 0.00017080424390769622, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002200016633651103, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15613.0, + "completions/mean_length": 5884.078125, + "completions/mean_terminated_length": 4994.25439453125, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.46595389023423195, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001585305784828961, + "learning_rate": 1e-05, + "loss": -0.01, + "num_tokens": 53047807.0, + "reward": 0.6875, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.467176616191864, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.22881244122982025, + "sampling/sampling_logp_difference/max": 1.76997709274292, + "sampling/sampling_logp_difference/mean": 0.017738256603479385, + "step": 152 + }, + { + "clip_ratio/high_max": 0.00021771475485365954, + "clip_ratio/high_mean": 6.966911666950182e-05, + "clip_ratio/low_mean": 0.0002597284528746968, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003293975751148537, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15895.0, + "completions/mean_length": 6931.078125, + "completions/mean_terminated_length": 6781.0322265625, + "completions/min_length": 1018.0, + 
"completions/min_terminated_length": 1018.0, + "entropy": 0.4630916491150856, + "epoch": 0.0703771849126035, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001962294103577733, + "learning_rate": 1e-05, + "loss": 0.0243, + "num_tokens": 53499988.0, + "reward": 0.578125, + "reward_std": 0.32407689094543457, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897910118103, + "sampling/importance_sampling_ratio/min": 0.250827819108963, + "sampling/sampling_logp_difference/max": 1.3829885721206665, + "sampling/sampling_logp_difference/mean": 0.019276238977909088, + "step": 153 + }, + { + "clip_ratio/high_max": 9.259459875465836e-05, + "clip_ratio/high_mean": 2.314864968866459e-05, + "clip_ratio/low_mean": 0.0001647196718295163, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001878683215181809, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16380.0, + "completions/max_terminated_length": 16380.0, + "completions/mean_length": 4187.609375, + "completions/mean_terminated_length": 4187.609375, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "entropy": 0.4736147038638592, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0039059605915099382, + "learning_rate": 1e-05, + "loss": -0.0232, + "num_tokens": 53777891.0, + "reward": 0.515625, + "reward_std": 0.23144522309303284, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000485181808472, + "sampling/importance_sampling_ratio/min": 0.059286389499902725, + "sampling/sampling_logp_difference/max": 2.825375556945801, + "sampling/sampling_logp_difference/mean": 0.017982449382543564, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0002281807282997761, 
+ "clip_ratio/high_mean": 6.215211851667846e-05, + "clip_ratio/low_mean": 0.00041431118188484106, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004764632985825301, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11462.0, + "completions/max_terminated_length": 11462.0, + "completions/mean_length": 4636.609375, + "completions/mean_terminated_length": 4636.609375, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.4699357636272907, + "epoch": 0.07129714811407543, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015674931928515434, + "learning_rate": 1e-05, + "loss": 0.0001, + "num_tokens": 54083730.0, + "reward": 0.421875, + "reward_std": 0.32407689094543457, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999392032623291, + "sampling/importance_sampling_ratio/min": 0.3012267053127289, + "sampling/sampling_logp_difference/max": 1.4397335052490234, + "sampling/sampling_logp_difference/mean": 0.018028534948825836, + "step": 155 + }, + { + "clip_ratio/high_max": 0.00019255842562415637, + "clip_ratio/high_mean": 6.385802771546878e-05, + "clip_ratio/low_mean": 0.00015325745880545583, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021711548879466136, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14616.0, + "completions/mean_length": 4546.90625, + "completions/mean_terminated_length": 4165.064453125, + "completions/min_length": 1046.0, + "completions/min_terminated_length": 1046.0, + "entropy": 0.3721245974302292, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0034301101695746183, + "learning_rate": 1e-05, + "loss": -0.0319, + "num_tokens": 54384076.0, + "reward": 0.703125, + "reward_std": 0.344576358795166, + 
"rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.4604927599430084, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999713897705078, + "sampling/importance_sampling_ratio/min": 0.3777761459350586, + "sampling/sampling_logp_difference/max": 0.9734535217285156, + "sampling/sampling_logp_difference/mean": 0.013778747990727425, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0002033198470599018, + "clip_ratio/high_mean": 8.211985141315381e-05, + "clip_ratio/low_mean": 0.00019877607019225252, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00028089592706237454, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15427.0, + "completions/mean_length": 5861.140625, + "completions/mean_terminated_length": 5159.61669921875, + "completions/min_length": 922.0, + "completions/min_terminated_length": 922.0, + "entropy": 0.402770210057497, + "epoch": 0.07221711131554738, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0019413913832977414, + "learning_rate": 1e-05, + "loss": -0.015, + "num_tokens": 54768301.0, + "reward": 0.6875, + "reward_std": 0.3913668990135193, + "rewards/accuracy_reward/mean": 0.6875, + "rewards/accuracy_reward/std": 0.467176616191864, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999744892120361, + "sampling/importance_sampling_ratio/min": 0.0876406580209732, + "sampling/sampling_logp_difference/max": 2.4345102310180664, + "sampling/sampling_logp_difference/mean": 0.01538553275167942, + "step": 157 + }, + { + "clip_ratio/high_max": 0.00023284805683942977, + "clip_ratio/high_mean": 8.598793738201493e-05, + "clip_ratio/low_mean": 0.0003366365535839577, + "clip_ratio/low_min": 4.388567595015047e-05, + "clip_ratio/region_mean": 0.00042262449642294087, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16110.0, + "completions/mean_length": 4339.15625, + "completions/mean_terminated_length": 4147.96826171875, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "entropy": 0.41392209380865097, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0042099012061953545, + "learning_rate": 1e-05, + "loss": 0.1212, + "num_tokens": 55056167.0, + "reward": 0.5625, + "reward_std": 0.4991811513900757, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999838471412659, + "sampling/importance_sampling_ratio/min": 0.19788731634616852, + "sampling/sampling_logp_difference/max": 1.8298425674438477, + "sampling/sampling_logp_difference/mean": 0.016492057591676712, + "step": 158 + }, + { + "clip_ratio/high_max": 0.00010216615919489413, + "clip_ratio/high_mean": 2.956860225822311e-05, + "clip_ratio/low_mean": 0.00016890704591787653, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001984756468118576, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11991.0, + "completions/mean_length": 3860.734375, + "completions/mean_terminated_length": 3661.95263671875, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.4760429188609123, + "epoch": 0.07313707451701931, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018255900358781219, + "learning_rate": 1e-05, + "loss": -0.0176, + "num_tokens": 55311598.0, + "reward": 0.5625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000277757644653, + "sampling/importance_sampling_ratio/min": 0.14818522334098816, + "sampling/sampling_logp_difference/max": 
1.909292221069336, + "sampling/sampling_logp_difference/mean": 0.01704004406929016, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0001849941472755745, + "clip_ratio/high_mean": 4.898101587968995e-05, + "clip_ratio/low_mean": 0.0001777787338141934, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002267597501486307, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15737.0, + "completions/mean_length": 5348.65625, + "completions/mean_terminated_length": 4805.93408203125, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 0.39790400862693787, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014152417425066233, + "learning_rate": 1e-05, + "loss": -0.0372, + "num_tokens": 55662544.0, + "reward": 0.546875, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570846557617, + "sampling/importance_sampling_ratio/min": 0.08356278389692307, + "sampling/sampling_logp_difference/max": 2.482156991958618, + "sampling/sampling_logp_difference/mean": 0.016587980091571808, + "step": 160 + }, + { + "clip_ratio/high_max": 6.86579851389979e-05, + "clip_ratio/high_mean": 1.7164496284749475e-05, + "clip_ratio/low_mean": 0.00020281358229112811, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021997808198648272, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15704.0, + "completions/mean_length": 6315.4375, + "completions/mean_terminated_length": 5644.2001953125, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "entropy": 0.41329350695014, + "epoch": 0.07405703771849126, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0008581069996580482, + 
"learning_rate": 1e-05, + "loss": -0.078, + "num_tokens": 56076764.0, + "reward": 0.5, + "reward_std": 0.2961388826370239, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999759197235107, + "sampling/importance_sampling_ratio/min": 0.020804718136787415, + "sampling/sampling_logp_difference/max": 3.872575521469116, + "sampling/sampling_logp_difference/mean": 0.015737876296043396, + "step": 161 + }, + { + "clip_ratio/high_max": 0.00016506154497619718, + "clip_ratio/high_mean": 5.038856306782691e-05, + "clip_ratio/low_mean": 7.605237874486193e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00012644094090319413, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15289.0, + "completions/max_terminated_length": 15289.0, + "completions/mean_length": 3972.796875, + "completions/mean_terminated_length": 3972.796875, + "completions/min_length": 637.0, + "completions/min_terminated_length": 637.0, + "entropy": 0.3322669602930546, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002519205678254366, + "learning_rate": 1e-05, + "loss": 0.0093, + "num_tokens": 56340543.0, + "reward": 0.71875, + "reward_std": 0.28247910737991333, + "rewards/accuracy_reward/mean": 0.71875, + "rewards/accuracy_reward/std": 0.4531635046005249, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000035762786865, + "sampling/importance_sampling_ratio/min": 0.0027362185064703226, + "sampling/sampling_logp_difference/max": 5.901178359985352, + "sampling/sampling_logp_difference/mean": 0.012547864578664303, + "step": 162 + }, + { + "clip_ratio/high_max": 6.577009116881527e-05, + "clip_ratio/high_mean": 1.6442522792203818e-05, + "clip_ratio/low_mean": 0.0001267316197299806, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014317414297693176, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 12751.0, + "completions/max_terminated_length": 12751.0, + "completions/mean_length": 3359.796875, + "completions/mean_terminated_length": 3359.796875, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.4078873060643673, + "epoch": 0.0749770009199632, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0031421470921486616, + "learning_rate": 1e-05, + "loss": -0.044, + "num_tokens": 56573618.0, + "reward": 0.46875, + "reward_std": 0.26409149169921875, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 1.9408248662948608, + "sampling/importance_sampling_ratio/mean": 1.0000327825546265, + "sampling/importance_sampling_ratio/min": 0.05682510510087013, + "sampling/sampling_logp_difference/max": 2.867777109146118, + "sampling/sampling_logp_difference/mean": 0.015189846977591515, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0002339555685466621, + "clip_ratio/high_mean": 5.848889213666553e-05, + "clip_ratio/low_mean": 0.00019084552513959352, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002493344186405011, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16243.0, + "completions/mean_length": 5143.953125, + "completions/mean_terminated_length": 4965.5400390625, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.45641181245446205, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0013033527648076415, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 56912623.0, + "reward": 0.5, + "reward_std": 0.2540663480758667, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000520944595337, 
+ "sampling/importance_sampling_ratio/min": 0.10672712326049805, + "sampling/sampling_logp_difference/max": 2.2374799251556396, + "sampling/sampling_logp_difference/mean": 0.018095262348651886, + "step": 164 + }, + { + "clip_ratio/high_max": 0.00015645647636119975, + "clip_ratio/high_mean": 5.4849623893460375e-05, + "clip_ratio/low_mean": 0.00022452728444477543, + "clip_ratio/low_min": 1.8452908989274874e-05, + "clip_ratio/region_mean": 0.00027937691174884094, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15524.0, + "completions/max_terminated_length": 15524.0, + "completions/mean_length": 4861.796875, + "completions/mean_terminated_length": 4861.796875, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.3686519227921963, + "epoch": 0.07589696412143514, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002058787737041712, + "learning_rate": 1e-05, + "loss": -0.0061, + "num_tokens": 57231410.0, + "reward": 0.640625, + "reward_std": 0.4434390664100647, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000720024108887, + "sampling/importance_sampling_ratio/min": 0.29414093494415283, + "sampling/sampling_logp_difference/max": 1.223696231842041, + "sampling/sampling_logp_difference/mean": 0.013861306011676788, + "step": 165 + }, + { + "clip_ratio/high_max": 7.898918920545839e-05, + "clip_ratio/high_mean": 1.9747297301364597e-05, + "clip_ratio/low_mean": 0.00023958213569130749, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002593294357211562, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13863.0, + "completions/max_terminated_length": 13863.0, + "completions/mean_length": 3646.5625, + "completions/mean_terminated_length": 3646.5625, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "entropy": 
0.4185665175318718, + "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021702114026993513, + "learning_rate": 1e-05, + "loss": -0.0115, + "num_tokens": 57476102.0, + "reward": 0.421875, + "reward_std": 0.34717273712158203, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999930202960968, + "sampling/importance_sampling_ratio/min": 0.26276907324790955, + "sampling/sampling_logp_difference/max": 1.336479663848877, + "sampling/sampling_logp_difference/mean": 0.015390090644359589, + "step": 166 + }, + { + "clip_ratio/high_max": 9.615711496735457e-05, + "clip_ratio/high_mean": 2.4039278741838643e-05, + "clip_ratio/low_mean": 8.294612644021981e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001069854047273111, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12572.0, + "completions/max_terminated_length": 12572.0, + "completions/mean_length": 3261.03125, + "completions/mean_terminated_length": 3261.03125, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.38933373615145683, + "epoch": 0.07681692732290708, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0019839778542518616, + "learning_rate": 1e-05, + "loss": 0.0163, + "num_tokens": 57698704.0, + "reward": 0.546875, + "reward_std": 0.24039676785469055, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 1.9696807861328125, + "sampling/importance_sampling_ratio/mean": 1.0000805854797363, + "sampling/importance_sampling_ratio/min": 0.23458817601203918, + "sampling/sampling_logp_difference/max": 1.4499237537384033, + "sampling/sampling_logp_difference/mean": 0.013868526555597782, + "step": 167 + }, + { + "clip_ratio/high_max": 0.00010562563329585828, + "clip_ratio/high_mean": 
3.383952889635111e-05, + "clip_ratio/low_mean": 0.00020006230874969333, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023390183969240752, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14331.0, + "completions/mean_length": 4422.734375, + "completions/mean_terminated_length": 4036.886962890625, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "entropy": 0.4031215123832226, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.001986635150387883, + "learning_rate": 1e-05, + "loss": 0.0557, + "num_tokens": 57993199.0, + "reward": 0.734375, + "reward_std": 0.4024401307106018, + "rewards/accuracy_reward/mean": 0.734375, + "rewards/accuracy_reward/std": 0.44515693187713623, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999784231185913, + "sampling/importance_sampling_ratio/min": 0.06947216391563416, + "sampling/sampling_logp_difference/max": 2.6668291091918945, + "sampling/sampling_logp_difference/mean": 0.015180530957877636, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0001730810577100783, + "clip_ratio/high_mean": 4.773723605921987e-05, + "clip_ratio/low_mean": 0.0002442104923829902, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002919477325349362, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16171.0, + "completions/mean_length": 6199.890625, + "completions/mean_terminated_length": 5871.37060546875, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.5146941468119621, + "epoch": 0.07773689052437903, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0007542786770500243, + "learning_rate": 1e-05, + "loss": 0.1269, + "num_tokens": 58399648.0, + "reward": 0.359375, + "reward_std": 0.3403330445289612, + "rewards/accuracy_reward/mean": 0.359375, + 
"rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000231266021729, + "sampling/importance_sampling_ratio/min": 0.19065004587173462, + "sampling/sampling_logp_difference/max": 1.657315731048584, + "sampling/sampling_logp_difference/mean": 0.018767032772302628, + "step": 169 + }, + { + "clip_ratio/high_max": 9.394133485329803e-05, + "clip_ratio/high_mean": 2.3485333713324508e-05, + "clip_ratio/low_mean": 0.00011168645596626448, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00013517178831534693, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15655.0, + "completions/mean_length": 3896.234375, + "completions/mean_terminated_length": 3698.01611328125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.5078827440738678, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017290591495111585, + "learning_rate": 1e-05, + "loss": 0.0328, + "num_tokens": 58658479.0, + "reward": 0.34375, + "reward_std": 0.3119301199913025, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 7.327488128794357e-05, + "sampling/sampling_logp_difference/max": 9.521292686462402, + "sampling/sampling_logp_difference/mean": 0.017025060951709747, + "step": 170 + }, + { + "clip_ratio/high_max": 0.00020841371315327706, + "clip_ratio/high_mean": 8.523951714778377e-05, + "clip_ratio/low_mean": 0.00012622540452866815, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021146491963008884, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11989.0, + "completions/mean_length": 
4735.96875, + "completions/mean_terminated_length": 4360.2255859375, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "entropy": 0.45149439200758934, + "epoch": 0.07865685372585096, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0011273949639871716, + "learning_rate": 1e-05, + "loss": 0.0249, + "num_tokens": 58972453.0, + "reward": 0.484375, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999274611473083, + "sampling/importance_sampling_ratio/min": 0.08670976012945175, + "sampling/sampling_logp_difference/max": 2.4451887607574463, + "sampling/sampling_logp_difference/mean": 0.01583728939294815, + "step": 171 + }, + { + "clip_ratio/high_max": 0.00019820678062387742, + "clip_ratio/high_mean": 7.964204723975854e-05, + "clip_ratio/low_mean": 0.00010668787808754132, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001863299248725525, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14688.0, + "completions/mean_length": 7339.859375, + "completions/mean_terminated_length": 6229.17529296875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.588470846414566, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0008887532167136669, + "learning_rate": 1e-05, + "loss": 0.0996, + "num_tokens": 59451860.0, + "reward": 0.578125, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999399185180664, + "sampling/importance_sampling_ratio/min": 0.07705886662006378, + "sampling/sampling_logp_difference/max": 2.563185691833496, + 
"sampling/sampling_logp_difference/mean": 0.02082424983382225, + "step": 172 + }, + { + "clip_ratio/high_max": 0.00018353125824432936, + "clip_ratio/high_mean": 4.588281456108234e-05, + "clip_ratio/low_mean": 0.0001578366407102294, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002037194581134827, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15312.0, + "completions/mean_length": 5281.125, + "completions/mean_terminated_length": 4132.5517578125, + "completions/min_length": 509.0, + "completions/min_terminated_length": 509.0, + "entropy": 0.4296277277171612, + "epoch": 0.07957681692732291, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.001513984752818942, + "learning_rate": 1e-05, + "loss": -0.0775, + "num_tokens": 59799380.0, + "reward": 0.515625, + "reward_std": 0.4024401307106018, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999828934669495, + "sampling/importance_sampling_ratio/min": 0.008889591321349144, + "sampling/sampling_logp_difference/max": 4.722874164581299, + "sampling/sampling_logp_difference/mean": 0.018146134912967682, + "step": 173 + }, + { + "clip_ratio/high_max": 9.688743193692062e-05, + "clip_ratio/high_mean": 2.891019539674744e-05, + "clip_ratio/low_mean": 0.00018989583531947574, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021880603753743344, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15439.0, + "completions/mean_length": 6502.078125, + "completions/mean_terminated_length": 5288.5087890625, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 0.5399197302758694, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024728032294660807, + "learning_rate": 
1e-05, + "loss": 0.042, + "num_tokens": 60224913.0, + "reward": 0.578125, + "reward_std": 0.23568853735923767, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999677538871765, + "sampling/importance_sampling_ratio/min": 0.16374215483665466, + "sampling/sampling_logp_difference/max": 1.809462308883667, + "sampling/sampling_logp_difference/mean": 0.018581923097372055, + "step": 174 + }, + { + "clip_ratio/high_max": 0.00013797258361591958, + "clip_ratio/high_mean": 3.965495989177725e-05, + "clip_ratio/low_mean": 0.0002399629820502014, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002796179478536942, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16090.0, + "completions/mean_length": 5760.640625, + "completions/mean_terminated_length": 5417.95166015625, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "entropy": 0.4306669682264328, + "epoch": 0.08049678012879485, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0021515442058444023, + "learning_rate": 1e-05, + "loss": 0.0035, + "num_tokens": 60602346.0, + "reward": 0.59375, + "reward_std": 0.3777071237564087, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000123977661133, + "sampling/importance_sampling_ratio/min": 0.07492339611053467, + "sampling/sampling_logp_difference/max": 2.5912890434265137, + "sampling/sampling_logp_difference/mean": 0.018847323954105377, + "step": 175 + }, + { + "clip_ratio/high_max": 0.00012638730822800426, + "clip_ratio/high_mean": 3.932306321985379e-05, + "clip_ratio/low_mean": 0.00021857243700651452, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00025789550363697344, + 
"completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15591.0, + "completions/mean_length": 5001.828125, + "completions/mean_terminated_length": 4442.048828125, + "completions/min_length": 854.0, + "completions/min_terminated_length": 854.0, + "entropy": 0.386859655380249, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002064446220174432, + "learning_rate": 1e-05, + "loss": 0.0518, + "num_tokens": 60933319.0, + "reward": 0.671875, + "reward_std": 0.4024401307106018, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999225735664368, + "sampling/importance_sampling_ratio/min": 0.001200833241455257, + "sampling/sampling_logp_difference/max": 6.7247395515441895, + "sampling/sampling_logp_difference/mean": 0.015642710030078888, + "step": 176 + }, + { + "clip_ratio/high_max": 0.00010242240023217164, + "clip_ratio/high_mean": 2.9891118629166158e-05, + "clip_ratio/low_mean": 4.038109773318865e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 7.027221681710216e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 11816.0, + "completions/max_terminated_length": 11816.0, + "completions/mean_length": 3618.96875, + "completions/mean_terminated_length": 3618.96875, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 0.5046555213630199, + "epoch": 0.08141674333026679, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0018260818906128407, + "learning_rate": 1e-05, + "loss": -0.1102, + "num_tokens": 61174021.0, + "reward": 0.65625, + "reward_std": 0.2925041913986206, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000065803527832, + 
"sampling/importance_sampling_ratio/min": 0.4336312413215637, + "sampling/sampling_logp_difference/max": 0.8355607986450195, + "sampling/sampling_logp_difference/mean": 0.01464638952165842, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0001466885069021373, + "clip_ratio/high_mean": 4.841328131988121e-05, + "clip_ratio/low_mean": 0.00021246483720460674, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002608781214803457, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15898.0, + "completions/max_terminated_length": 15898.0, + "completions/mean_length": 6726.5625, + "completions/mean_terminated_length": 6726.5625, + "completions/min_length": 792.0, + "completions/min_terminated_length": 792.0, + "entropy": 0.529183816164732, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0017305320361629128, + "learning_rate": 1e-05, + "loss": 0.1277, + "num_tokens": 61614929.0, + "reward": 0.546875, + "reward_std": 0.398196816444397, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999656081199646, + "sampling/importance_sampling_ratio/min": 0.008894972503185272, + "sampling/sampling_logp_difference/max": 4.722269058227539, + "sampling/sampling_logp_difference/mean": 0.01910485327243805, + "step": 178 + }, + { + "clip_ratio/high_max": 7.97040474935784e-05, + "clip_ratio/high_mean": 1.99260118733946e-05, + "clip_ratio/low_mean": 0.0001781473129085498, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00019807332500931807, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12765.0, + "completions/mean_length": 4369.1875, + "completions/mean_terminated_length": 3778.294921875, + "completions/min_length": 638.0, + "completions/min_terminated_length": 638.0, + "entropy": 0.41936155781149864, + "epoch": 
0.08233670653173873, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0009695704793557525, + "learning_rate": 1e-05, + "loss": 0.0105, + "num_tokens": 61905869.0, + "reward": 0.375, + "reward_std": 0.2961388826370239, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000646114349365, + "sampling/importance_sampling_ratio/min": 0.08589151501655579, + "sampling/sampling_logp_difference/max": 2.4546701908111572, + "sampling/sampling_logp_difference/mean": 0.014310698956251144, + "step": 179 + }, + { + "clip_ratio/high_max": 0.00015925395200611092, + "clip_ratio/high_mean": 5.9038932477051276e-05, + "clip_ratio/low_mean": 0.00033926496098501957, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003983038918704551, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13708.0, + "completions/max_terminated_length": 13708.0, + "completions/mean_length": 4426.296875, + "completions/mean_terminated_length": 4426.296875, + "completions/min_length": 724.0, + "completions/min_terminated_length": 724.0, + "entropy": 0.3644730970263481, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0037218655925244093, + "learning_rate": 1e-05, + "loss": 0.015, + "num_tokens": 62198704.0, + "reward": 0.671875, + "reward_std": 0.38194066286087036, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.17468461394309998, + "sampling/sampling_logp_difference/max": 1.7447731494903564, + "sampling/sampling_logp_difference/mean": 0.01487559825181961, + "step": 180 + }, + { + "clip_ratio/high_max": 8.359176717931405e-05, + "clip_ratio/high_mean": 3.191635573784879e-05, + "clip_ratio/low_mean": 
0.0004070220602443442, + "clip_ratio/low_min": 3.077680594287813e-05, + "clip_ratio/region_mean": 0.0004389384193927981, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16178.0, + "completions/mean_length": 5812.25, + "completions/mean_terminated_length": 5292.32763671875, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 0.45648736134171486, + "epoch": 0.08325666973321068, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0027508491184562445, + "learning_rate": 1e-05, + "loss": 0.031, + "num_tokens": 62580064.0, + "reward": 0.421875, + "reward_std": 0.4650121033191681, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000232458114624, + "sampling/importance_sampling_ratio/min": 0.08442410081624985, + "sampling/sampling_logp_difference/max": 2.471902370452881, + "sampling/sampling_logp_difference/mean": 0.016401425004005432, + "step": 181 + }, + { + "clip_ratio/high_max": 4.4125090425950475e-05, + "clip_ratio/high_mean": 1.1031272606487619e-05, + "clip_ratio/low_mean": 7.451805299751868e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 8.554932537663262e-05, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15625.0, + "completions/mean_length": 7307.546875, + "completions/mean_terminated_length": 6219.73681640625, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "entropy": 0.6124343201518059, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0018599852919578552, + "learning_rate": 1e-05, + "loss": -0.0063, + "num_tokens": 63064195.0, + "reward": 0.234375, + "reward_std": 0.1315089464187622, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 
0.42695629596710205, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999361038208008, + "sampling/importance_sampling_ratio/min": 2.8124927098360786e-07, + "sampling/sampling_logp_difference/max": 15.084024429321289, + "sampling/sampling_logp_difference/mean": 0.02131102979183197, + "step": 182 + }, + { + "clip_ratio/high_max": 0.00011692623138515046, + "clip_ratio/high_mean": 4.184022350273153e-05, + "clip_ratio/low_mean": 0.00020591235761457938, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00024775257406872697, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16140.0, + "completions/mean_length": 6444.59375, + "completions/mean_terminated_length": 5602.27099609375, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "entropy": 0.3952566310763359, + "epoch": 0.08417663293468261, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023683710023760796, + "learning_rate": 1e-05, + "loss": 0.1244, + "num_tokens": 63488401.0, + "reward": 0.671875, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000025987625122, + "sampling/importance_sampling_ratio/min": 0.0319228358566761, + "sampling/sampling_logp_difference/max": 3.4444336891174316, + "sampling/sampling_logp_difference/mean": 0.016494080424308777, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0004113621180295013, + "clip_ratio/high_mean": 0.00013114491684973473, + "clip_ratio/low_mean": 0.00016759118079789914, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00029873609946662327, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15472.0, + "completions/max_terminated_length": 15472.0, + "completions/mean_length": 4065.046875, + 
"completions/mean_terminated_length": 4065.046875, + "completions/min_length": 761.0, + "completions/min_terminated_length": 761.0, + "entropy": 0.4868856966495514, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0024668388068675995, + "learning_rate": 1e-05, + "loss": 0.0968, + "num_tokens": 63757052.0, + "reward": 0.59375, + "reward_std": 0.400318443775177, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999004006385803, + "sampling/importance_sampling_ratio/min": 0.17345686256885529, + "sampling/sampling_logp_difference/max": 1.751826286315918, + "sampling/sampling_logp_difference/mean": 0.016633857041597366, + "step": 184 + }, + { + "clip_ratio/high_max": 0.00011292714589217212, + "clip_ratio/high_mean": 3.753670944206533e-05, + "clip_ratio/low_mean": 0.00019003584657184547, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00022757255737815285, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14510.0, + "completions/mean_length": 4484.328125, + "completions/mean_terminated_length": 3899.09814453125, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.42744625359773636, + "epoch": 0.08509659613615456, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025180387310683727, + "learning_rate": 1e-05, + "loss": 0.0264, + "num_tokens": 64056361.0, + "reward": 0.65625, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000048279762268, + "sampling/importance_sampling_ratio/min": 0.10562728345394135, + "sampling/sampling_logp_difference/max": 2.2478384971618652, + 
"sampling/sampling_logp_difference/mean": 0.016854196786880493, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0001615245364519069, + "clip_ratio/high_mean": 5.795405149910948e-05, + "clip_ratio/low_mean": 0.0002756438070719014, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00033359785993525293, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13914.0, + "completions/mean_length": 6078.625, + "completions/mean_terminated_length": 5205.2880859375, + "completions/min_length": 643.0, + "completions/min_terminated_length": 643.0, + "entropy": 0.44200121238827705, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003769052680581808, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 64456505.0, + "reward": 0.609375, + "reward_std": 0.34246450662612915, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000364780426025, + "sampling/importance_sampling_ratio/min": 0.05654711276292801, + "sampling/sampling_logp_difference/max": 2.872681140899658, + "sampling/sampling_logp_difference/mean": 0.019126243889331818, + "step": 186 + }, + { + "clip_ratio/high_max": 7.941897001728648e-05, + "clip_ratio/high_mean": 2.403534449513245e-05, + "clip_ratio/low_mean": 0.00031823053905100096, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003422658846830018, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14135.0, + "completions/mean_length": 5394.234375, + "completions/mean_terminated_length": 4853.75390625, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "entropy": 0.5750439912080765, + "epoch": 0.08601655933762649, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0011502739507704973, + "learning_rate": 1e-05, 
+ "loss": 0.008, + "num_tokens": 64812360.0, + "reward": 0.3125, + "reward_std": 0.3335031569004059, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000646114349365, + "sampling/importance_sampling_ratio/min": 1.883326774532179e-07, + "sampling/sampling_logp_difference/max": 15.485055923461914, + "sampling/sampling_logp_difference/mean": 0.01883636973798275, + "step": 187 + }, + { + "clip_ratio/high_max": 0.00017953455608221702, + "clip_ratio/high_mean": 5.071163013781188e-05, + "clip_ratio/low_mean": 0.00021184172874200158, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00026255335797031876, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11992.0, + "completions/mean_length": 3735.09375, + "completions/mean_terminated_length": 3113.01611328125, + "completions/min_length": 1068.0, + "completions/min_terminated_length": 1068.0, + "entropy": 0.3347371593117714, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005435355007648468, + "learning_rate": 1e-05, + "loss": 0.2343, + "num_tokens": 65060254.0, + "reward": 0.703125, + "reward_std": 0.35141605138778687, + "rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.4604927599430084, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000994205474854, + "sampling/importance_sampling_ratio/min": 0.3014819324016571, + "sampling/sampling_logp_difference/max": 1.3178880214691162, + "sampling/sampling_logp_difference/mean": 0.011843510903418064, + "step": 188 + }, + { + "clip_ratio/high_max": 0.00017920508889801567, + "clip_ratio/high_mean": 4.987927036381734e-05, + "clip_ratio/low_mean": 0.0003012043116541463, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00035108358360957936, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 12396.0, + "completions/max_terminated_length": 12396.0, + "completions/mean_length": 3665.59375, + "completions/mean_terminated_length": 3665.59375, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 0.3805939368903637, + "epoch": 0.08693652253909843, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0012891854858025908, + "learning_rate": 1e-05, + "loss": 0.0735, + "num_tokens": 65303860.0, + "reward": 0.671875, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483823776245, + "sampling/importance_sampling_ratio/min": 0.1537669152021408, + "sampling/sampling_logp_difference/max": 1.8723173141479492, + "sampling/sampling_logp_difference/mean": 0.014232308603823185, + "step": 189 + }, + { + "clip_ratio/high_max": 0.00025017756706802174, + "clip_ratio/high_mean": 9.517185890217661e-05, + "clip_ratio/low_mean": 0.0002550413046265021, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00035021316580241546, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14472.0, + "completions/max_terminated_length": 14472.0, + "completions/mean_length": 4944.328125, + "completions/mean_terminated_length": 4944.328125, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "entropy": 0.4690123051404953, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.005588939413428307, + "learning_rate": 1e-05, + "loss": -0.0124, + "num_tokens": 65629153.0, + "reward": 0.625, + "reward_std": 0.44663429260253906, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999592900276184, + 
"sampling/importance_sampling_ratio/min": 0.25745633244514465, + "sampling/sampling_logp_difference/max": 1.356905221939087, + "sampling/sampling_logp_difference/mean": 0.018515609204769135, + "step": 190 + }, + { + "clip_ratio/high_max": 0.00018033986088994425, + "clip_ratio/high_mean": 6.675588315374625e-05, + "clip_ratio/low_mean": 0.00043161257781321183, + "clip_ratio/low_min": 7.48055026633665e-05, + "clip_ratio/region_mean": 0.0004983684593753424, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16026.0, + "completions/mean_length": 6431.15625, + "completions/mean_terminated_length": 5587.69482421875, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.42928578704595566, + "epoch": 0.08785648574057038, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0025411290116608143, + "learning_rate": 1e-05, + "loss": 0.1312, + "num_tokens": 66050187.0, + "reward": 0.53125, + "reward_std": 0.3956102132797241, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000171184539795, + "sampling/importance_sampling_ratio/min": 0.025968920439481735, + "sampling/sampling_logp_difference/max": 3.6508548259735107, + "sampling/sampling_logp_difference/mean": 0.018628142774105072, + "step": 191 + }, + { + "clip_ratio/high_max": 0.000127093402625178, + "clip_ratio/high_mean": 5.626492793453508e-05, + "clip_ratio/low_mean": 0.00029342325888137566, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003496881909086369, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15049.0, + "completions/mean_length": 4280.375, + "completions/mean_terminated_length": 3889.935302734375, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "entropy": 
0.37104332447052, + "epoch": 0.08831646734130635, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.002807023236528039, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 66332643.0, + "reward": 0.625, + "reward_std": 0.44663429260253906, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000070571899414, + "sampling/importance_sampling_ratio/min": 0.29366788268089294, + "sampling/sampling_logp_difference/max": 1.2253057956695557, + "sampling/sampling_logp_difference/mean": 0.014485626481473446, + "step": 192 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 66332643, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_lora_7b_20251202_002719/checkpoint-256/latest b/dapo_lora_7b_20251202_002719/checkpoint-256/latest new file mode 100644 index 0000000000000000000000000000000000000000..b747f9725067064e241a7a3bed90583971af8ad1 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-256/latest @@ -0,0 +1 @@ +global_step256 \ No newline at end of file diff --git a/dapo_lora_7b_20251202_002719/checkpoint-256/special_tokens_map.json b/dapo_lora_7b_20251202_002719/checkpoint-256/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-256/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + 
"eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_lora_7b_20251202_002719/checkpoint-256/tokenizer_config.json b/dapo_lora_7b_20251202_002719/checkpoint-256/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-256/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_lora_7b_20251202_002719/checkpoint-256/zero_to_fp32.py b/dapo_lora_7b_20251202_002719/checkpoint-256/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-256/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/README.md b/dapo_lora_7b_20251202_002719/checkpoint-64/README.md new file mode 100644 index 0000000000000000000000000000000000000000..93603607c9fb9b3b4d2aece2cf11d1492643ced8 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-64/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-7B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct 
and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/chat_template.jinja b/dapo_lora_7b_20251202_002719/checkpoint-64/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-64/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first 
%}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/latest b/dapo_lora_7b_20251202_002719/checkpoint-64/latest new file mode 100644 index 0000000000000000000000000000000000000000..4a12e7f9029554e8e5ce68ebe3e97d0b4e734304 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-64/latest @@ -0,0 +1 @@ +global_step64 \ No newline at end of file diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/special_tokens_map.json b/dapo_lora_7b_20251202_002719/checkpoint-64/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-64/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/tokenizer_config.json b/dapo_lora_7b_20251202_002719/checkpoint-64/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-64/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + 
"151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/trainer_state.json b/dapo_lora_7b_20251202_002719/checkpoint-64/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a0c2e8c9eeb8bf6080bf55e7d2012e5f33fecdd9 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-64/trainer_state.json @@ -0,0 +1,2018 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.029438822447102116, + "eval_steps": 500, + "global_step": 64, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16233.0, + "completions/max_terminated_length": 16233.0, + "completions/mean_length": 5701.859375, + "completions/mean_terminated_length": 5701.859375, + "completions/min_length": 630.0, + "completions/min_terminated_length": 630.0, + "entropy": 0.35103847086429596, + "epoch": 0.00045998160073597056, + "frac_reward_zero_std": 0.0, + "grad_norm": 
0.0027150087989866734, + "learning_rate": 1e-05, + "loss": 0.0764, + "num_tokens": 372903.0, + "reward": 0.71875, + "reward_std": 0.4581822156906128, + "rewards/accuracy_reward/mean": 0.71875, + "rewards/accuracy_reward/std": 0.4531635046005249, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000617504119873, + "sampling/importance_sampling_ratio/min": 0.2750210464000702, + "sampling/sampling_logp_difference/max": 1.290907621383667, + "sampling/sampling_logp_difference/mean": 0.01358163170516491, + "step": 1 + }, + { + "clip_ratio/high_max": 0.00010992912939400412, + "clip_ratio/high_mean": 2.748228234850103e-05, + "clip_ratio/low_mean": 0.00016060493635450257, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001880872223409824, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 7385.90625, + "completions/mean_terminated_length": 6455.06884765625, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "entropy": 0.5675897598266602, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0007440462941303849, + "learning_rate": 1e-05, + "loss": -0.0152, + "num_tokens": 856873.0, + "reward": 0.390625, + "reward_std": 0.2198973000049591, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999367594718933, + "sampling/importance_sampling_ratio/min": 0.009396589361131191, + "sampling/sampling_logp_difference/max": 4.667408466339111, + "sampling/sampling_logp_difference/mean": 0.022290317341685295, + "step": 2 + }, + { + "clip_ratio/high_max": 0.00018680206630961038, + "clip_ratio/high_mean": 7.093910403455084e-05, + "clip_ratio/low_mean": 0.0002504906224203296, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
0.00032142972168003325, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15091.0, + "completions/mean_length": 5608.828125, + "completions/mean_terminated_length": 5437.7939453125, + "completions/min_length": 936.0, + "completions/min_terminated_length": 936.0, + "entropy": 0.44635456055402756, + "epoch": 0.0013799448022079118, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002476191846653819, + "learning_rate": 1e-05, + "loss": 0.0755, + "num_tokens": 1225782.0, + "reward": 0.578125, + "reward_std": 0.3776973485946655, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999972581863403, + "sampling/importance_sampling_ratio/min": 0.16118201613426208, + "sampling/sampling_logp_difference/max": 1.825221061706543, + "sampling/sampling_logp_difference/mean": 0.017525848001241684, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0002787337944027968, + "clip_ratio/high_mean": 8.000510115380166e-05, + "clip_ratio/low_mean": 0.00027736531956179533, + "clip_ratio/low_min": 2.338634294574149e-05, + "clip_ratio/region_mean": 0.0003573704316295334, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14515.0, + "completions/max_terminated_length": 14515.0, + "completions/mean_length": 3346.078125, + "completions/mean_terminated_length": 3346.078125, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "entropy": 0.545745424926281, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0037713816855102777, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 1453315.0, + "reward": 0.4375, + "reward_std": 0.4413174092769623, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000426769256592, + "sampling/importance_sampling_ratio/min": 0.08748604357242584, + "sampling/sampling_logp_difference/max": 2.4362759590148926, + "sampling/sampling_logp_difference/mean": 0.016878074035048485, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0002736507922236342, + "clip_ratio/high_mean": 0.00012070279444742482, + "clip_ratio/low_mean": 0.00037263989906932693, + "clip_ratio/low_min": 7.880559132900089e-05, + "clip_ratio/region_mean": 0.0004933426898787729, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15992.0, + "completions/mean_length": 7791.578125, + "completions/mean_terminated_length": 5601.35302734375, + "completions/min_length": 788.0, + "completions/min_terminated_length": 788.0, + "entropy": 0.4527555741369724, + "epoch": 0.0022999080036798527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0019191562896594405, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 1962144.0, + "reward": 0.484375, + "reward_std": 0.4987064301967621, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000364780426025, + "sampling/importance_sampling_ratio/min": 0.09914527088403702, + "sampling/sampling_logp_difference/max": 2.311169147491455, + "sampling/sampling_logp_difference/mean": 0.019328925758600235, + "step": 5 + }, + { + "clip_ratio/high_max": 0.000247960046181106, + "clip_ratio/high_mean": 6.500758581751143e-05, + "clip_ratio/low_mean": 8.249791471826029e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014750550326425582, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15770.0, + "completions/mean_length": 4726.546875, + "completions/mean_terminated_length": 4350.5, + "completions/min_length": 757.0, + 
"completions/min_terminated_length": 757.0, + "entropy": 0.5126069597899914, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002579454565420747, + "learning_rate": 1e-05, + "loss": -0.0359, + "num_tokens": 2273043.0, + "reward": 0.484375, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999161958694458, + "sampling/importance_sampling_ratio/min": 0.0002888332528527826, + "sampling/sampling_logp_difference/max": 8.14966106414795, + "sampling/sampling_logp_difference/mean": 0.01803017407655716, + "step": 6 + }, + { + "clip_ratio/high_max": 0.00017989838943321956, + "clip_ratio/high_mean": 6.093144725127786e-05, + "clip_ratio/low_mean": 0.00028579145509866066, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003467229043963016, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12743.0, + "completions/mean_length": 7409.0625, + "completions/mean_terminated_length": 6480.62060546875, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "entropy": 0.494194608181715, + "epoch": 0.003219871205151794, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002430765191093087, + "learning_rate": 1e-05, + "loss": 0.0822, + "num_tokens": 2757655.0, + "reward": 0.46875, + "reward_std": 0.40715816617012024, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999563694000244, + "sampling/importance_sampling_ratio/min": 0.17787444591522217, + "sampling/sampling_logp_difference/max": 1.726677417755127, + "sampling/sampling_logp_difference/mean": 0.019815418869256973, + "step": 7 + }, + { + "clip_ratio/high_max": 
0.00017167176974908216, + "clip_ratio/high_mean": 6.041262804501457e-05, + "clip_ratio/low_mean": 0.0002822945152729517, + "clip_ratio/low_min": 5.028157829656266e-05, + "clip_ratio/region_mean": 0.00034270713513251394, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13078.0, + "completions/mean_length": 4700.203125, + "completions/mean_terminated_length": 4323.30615234375, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "entropy": 0.39490213245153427, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0022012051194906235, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 3072436.0, + "reward": 0.609375, + "reward_std": 0.49446311593055725, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998409152030945, + "sampling/importance_sampling_ratio/min": 0.06603337824344635, + "sampling/sampling_logp_difference/max": 2.717594861984253, + "sampling/sampling_logp_difference/mean": 0.016631681472063065, + "step": 8 + }, + { + "clip_ratio/high_max": 0.00013108045459375717, + "clip_ratio/high_mean": 4.318108904044493e-05, + "clip_ratio/low_mean": 0.00023819861780793872, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002813797018461628, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15940.0, + "completions/mean_length": 5188.890625, + "completions/mean_terminated_length": 4827.7578125, + "completions/min_length": 790.0, + "completions/min_terminated_length": 790.0, + "entropy": 0.43566014245152473, + "epoch": 0.004139834406623735, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0016241734847426414, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 3414085.0, + "reward": 0.59375, + "reward_std": 
0.39820659160614014, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 1.9456381797790527, + "sampling/importance_sampling_ratio/mean": 1.0000399351119995, + "sampling/importance_sampling_ratio/min": 0.10360148549079895, + "sampling/sampling_logp_difference/max": 2.2672035694122314, + "sampling/sampling_logp_difference/mean": 0.01550372689962387, + "step": 9 + }, + { + "clip_ratio/high_max": 0.00010115922304976266, + "clip_ratio/high_mean": 2.5289805762440665e-05, + "clip_ratio/low_mean": 0.00034295484147151, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003682446440507192, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15217.0, + "completions/mean_length": 5832.875, + "completions/mean_terminated_length": 5492.51611328125, + "completions/min_length": 717.0, + "completions/min_terminated_length": 717.0, + "entropy": 0.600818321108818, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0010776554699987173, + "learning_rate": 1e-05, + "loss": -0.0314, + "num_tokens": 3798397.0, + "reward": 0.328125, + "reward_std": 0.37298911809921265, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999531507492065, + "sampling/importance_sampling_ratio/min": 0.0953303873538971, + "sampling/sampling_logp_difference/max": 2.3504066467285156, + "sampling/sampling_logp_difference/mean": 0.020683372393250465, + "step": 10 + }, + { + "clip_ratio/high_max": 0.00030824893383396557, + "clip_ratio/high_mean": 0.00011632417340479151, + "clip_ratio/low_mean": 0.0002341717704439361, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003504959422571119, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15746.0, + 
"completions/max_terminated_length": 15746.0, + "completions/mean_length": 4986.171875, + "completions/mean_terminated_length": 4986.171875, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "entropy": 0.40387310832738876, + "epoch": 0.005059797608095676, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003584277583286166, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 4127424.0, + "reward": 0.671875, + "reward_std": 0.4434390664100647, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998393654823303, + "sampling/importance_sampling_ratio/min": 0.02629905194044113, + "sampling/sampling_logp_difference/max": 3.6382224559783936, + "sampling/sampling_logp_difference/mean": 0.01555373053997755, + "step": 11 + }, + { + "clip_ratio/high_max": 0.00013135069002601085, + "clip_ratio/high_mean": 4.189404148746689e-05, + "clip_ratio/low_mean": 0.00014246321052269195, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018435725178278517, + "completions/clipped_ratio": 0.0, + "completions/max_length": 10079.0, + "completions/max_terminated_length": 10079.0, + "completions/mean_length": 3880.515625, + "completions/mean_terminated_length": 3880.515625, + "completions/min_length": 674.0, + "completions/min_terminated_length": 674.0, + "entropy": 0.4064784087240696, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017852422315627337, + "learning_rate": 1e-05, + "loss": 0.0198, + "num_tokens": 4384473.0, + "reward": 0.671875, + "reward_std": 0.2867126166820526, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999723434448242, + "sampling/importance_sampling_ratio/min": 0.37220701575279236, + 
"sampling/sampling_logp_difference/max": 0.9883050918579102, + "sampling/sampling_logp_difference/mean": 0.013887828215956688, + "step": 12 + }, + { + "clip_ratio/high_max": 0.00014981444019213086, + "clip_ratio/high_mean": 4.5794572770319064e-05, + "clip_ratio/low_mean": 0.00040218312869910733, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00044797768418902706, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16341.0, + "completions/mean_length": 8894.578125, + "completions/mean_terminated_length": 7669.0361328125, + "completions/min_length": 1085.0, + "completions/min_terminated_length": 1085.0, + "entropy": 0.5499315299093723, + "epoch": 0.005979760809567618, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004000168293714523, + "learning_rate": 1e-05, + "loss": 0.0373, + "num_tokens": 4963350.0, + "reward": 0.390625, + "reward_std": 0.2824692726135254, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999991774559021, + "sampling/importance_sampling_ratio/min": 0.047493718564510345, + "sampling/sampling_logp_difference/max": 3.0471577644348145, + "sampling/sampling_logp_difference/mean": 0.02204228937625885, + "step": 13 + }, + { + "clip_ratio/high_max": 0.00018746273144643055, + "clip_ratio/high_mean": 5.583179722634668e-05, + "clip_ratio/low_mean": 0.0001284618601857801, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001842936590037425, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12058.0, + "completions/max_terminated_length": 12058.0, + "completions/mean_length": 4584.0625, + "completions/mean_terminated_length": 4584.0625, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.4566480815410614, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.125, + 
"grad_norm": 0.003257408272475004, + "learning_rate": 1e-05, + "loss": -0.0342, + "num_tokens": 5266274.0, + "reward": 0.671875, + "reward_std": 0.3751009702682495, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750256538391, + "sampling/importance_sampling_ratio/min": 0.39602163434028625, + "sampling/sampling_logp_difference/max": 0.9262864589691162, + "sampling/sampling_logp_difference/mean": 0.01598881185054779, + "step": 14 + }, + { + "clip_ratio/high_max": 0.00015991039845175692, + "clip_ratio/high_mean": 5.3697508178629505e-05, + "clip_ratio/low_mean": 0.0003120610426776693, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00036575855119735934, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15009.0, + "completions/mean_length": 5134.671875, + "completions/mean_terminated_length": 4581.42578125, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "entropy": 0.41497115045785904, + "epoch": 0.0068997240110395585, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004677772056311369, + "learning_rate": 1e-05, + "loss": 0.05, + "num_tokens": 5603925.0, + "reward": 0.640625, + "reward_std": 0.3913571238517761, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001789331436157, + "sampling/importance_sampling_ratio/min": 0.07364130765199661, + "sampling/sampling_logp_difference/max": 2.608549118041992, + "sampling/sampling_logp_difference/mean": 0.016165096312761307, + "step": 15 + }, + { + "clip_ratio/high_max": 0.00025949142946046777, + "clip_ratio/high_mean": 9.68364292930346e-05, + "clip_ratio/low_mean": 0.000282365266684792, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 0.000379201697796816, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15173.0, + "completions/max_terminated_length": 15173.0, + "completions/mean_length": 4904.96875, + "completions/mean_terminated_length": 4904.96875, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.4841916747391224, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.002402309561148286, + "learning_rate": 1e-05, + "loss": 0.0633, + "num_tokens": 5928091.0, + "reward": 0.484375, + "reward_std": 0.41246524453163147, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999504685401917, + "sampling/importance_sampling_ratio/min": 0.0037722671404480934, + "sampling/sampling_logp_difference/max": 5.580079078674316, + "sampling/sampling_logp_difference/mean": 0.018390391021966934, + "step": 16 + }, + { + "clip_ratio/high_max": 6.219606439117342e-05, + "clip_ratio/high_mean": 1.5549016097793356e-05, + "clip_ratio/low_mean": 0.00019023374534299364, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002057827605312923, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13837.0, + "completions/mean_length": 5209.84375, + "completions/mean_terminated_length": 3837.578857421875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.3513585068285465, + "epoch": 0.0078196872125115, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019373978720977902, + "learning_rate": 1e-05, + "loss": 0.0016, + "num_tokens": 6271057.0, + "reward": 0.453125, + "reward_std": 0.3403330445289612, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999862015247345, + "sampling/importance_sampling_ratio/min": 0.1450539529323578, + "sampling/sampling_logp_difference/max": 1.9306495189666748, + "sampling/sampling_logp_difference/mean": 0.013681268319487572, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0001431612308806507, + "clip_ratio/high_mean": 4.711323526862543e-05, + "clip_ratio/low_mean": 9.270217788071022e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001398154154230724, + "completions/clipped_ratio": 0.0, + "completions/max_length": 9328.0, + "completions/max_terminated_length": 9328.0, + "completions/mean_length": 2520.640625, + "completions/mean_terminated_length": 2520.640625, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 0.36302734911441803, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027223003562539816, + "learning_rate": 1e-05, + "loss": -0.0416, + "num_tokens": 6441562.0, + "reward": 0.65625, + "reward_std": 0.33090677857398987, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000017762184143, + "sampling/importance_sampling_ratio/min": 0.3734391927719116, + "sampling/sampling_logp_difference/max": 0.9850001335144043, + "sampling/sampling_logp_difference/mean": 0.011676793918013573, + "step": 18 + }, + { + "clip_ratio/high_max": 0.00017718410344969016, + "clip_ratio/high_mean": 5.833459545101505e-05, + "clip_ratio/low_mean": 0.0002528423356125131, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00031117693106352817, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15108.0, + "completions/mean_length": 4240.96875, + "completions/mean_terminated_length": 4048.222412109375, + "completions/min_length": 310.0, + 
"completions/min_terminated_length": 310.0, + "entropy": 0.3896213509142399, + "epoch": 0.008739650413983441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.002503112656995654, + "learning_rate": 1e-05, + "loss": 0.0739, + "num_tokens": 6721568.0, + "reward": 0.59375, + "reward_std": 0.4991811513900757, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999947547912598, + "sampling/importance_sampling_ratio/min": 0.10363919287919998, + "sampling/sampling_logp_difference/max": 2.2668397426605225, + "sampling/sampling_logp_difference/mean": 0.014314994215965271, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0002049997847279883, + "clip_ratio/high_mean": 6.95637043008901e-05, + "clip_ratio/low_mean": 0.00011690972041833447, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018647342039912473, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15322.0, + "completions/mean_length": 3738.484375, + "completions/mean_terminated_length": 3116.573486328125, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "entropy": 0.29045598581433296, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002947593806311488, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 6969399.0, + "reward": 0.8125, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward/mean": 0.8125, + "rewards/accuracy_reward/std": 0.39339789748191833, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998925924301147, + "sampling/importance_sampling_ratio/min": 0.11472277343273163, + "sampling/sampling_logp_difference/max": 2.165236711502075, + "sampling/sampling_logp_difference/mean": 0.011310569941997528, + "step": 20 + }, + { + "clip_ratio/high_max": 
0.00010545731220190646, + "clip_ratio/high_mean": 3.014280719071394e-05, + "clip_ratio/low_mean": 0.00011199774735359824, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014214055443062534, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15392.0, + "completions/mean_length": 6065.90625, + "completions/mean_terminated_length": 5191.49169921875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.44125597178936005, + "epoch": 0.009659613615455382, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0011246009962633252, + "learning_rate": 1e-05, + "loss": 0.0021, + "num_tokens": 7365937.0, + "reward": 0.421875, + "reward_std": 0.23144522309303284, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000147819519043, + "sampling/importance_sampling_ratio/min": 0.25809481739997864, + "sampling/sampling_logp_difference/max": 1.3544282913208008, + "sampling/sampling_logp_difference/mean": 0.017348822206258774, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0003601935495680664, + "clip_ratio/high_mean": 9.941099415300414e-05, + "clip_ratio/low_mean": 0.00034870224044425413, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0004481132409637212, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10951.0, + "completions/mean_length": 3722.015625, + "completions/mean_terminated_length": 3521.031982421875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.4340820461511612, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.001601650146767497, + "learning_rate": 1e-05, + "loss": 0.0015, + "num_tokens": 7615658.0, + "reward": 0.5, + "reward_std": 0.3913668990135193, + 
"rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998630285263062, + "sampling/importance_sampling_ratio/min": 1.3064802715234691e-06, + "sampling/sampling_logp_difference/max": 13.548173904418945, + "sampling/sampling_logp_difference/mean": 0.016604293137788773, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0002349931419303175, + "clip_ratio/high_mean": 6.471897268056637e-05, + "clip_ratio/low_mean": 0.00014105365880823229, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020577262966980925, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15141.0, + "completions/max_terminated_length": 15141.0, + "completions/mean_length": 3747.484375, + "completions/mean_terminated_length": 3747.484375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.43806017562747, + "epoch": 0.010579576816927323, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017510901670902967, + "learning_rate": 1e-05, + "loss": -0.0391, + "num_tokens": 7867545.0, + "reward": 0.5625, + "reward_std": 0.22461533546447754, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000494718551636, + "sampling/importance_sampling_ratio/min": 0.1432838886976242, + "sampling/sampling_logp_difference/max": 1.942927360534668, + "sampling/sampling_logp_difference/mean": 0.015971330925822258, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0002638470396050252, + "clip_ratio/high_mean": 8.973176045401487e-05, + "clip_ratio/low_mean": 0.0001654990855968208, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002552308424128569, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15089.0, + "completions/mean_length": 
4868.609375, + "completions/mean_terminated_length": 4685.82568359375, + "completions/min_length": 1304.0, + "completions/min_terminated_length": 1304.0, + "entropy": 0.3689058944582939, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025512739084661007, + "learning_rate": 1e-05, + "loss": 0.0702, + "num_tokens": 8187720.0, + "reward": 0.625, + "reward_std": 0.35824596881866455, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999942779541016, + "sampling/importance_sampling_ratio/min": 0.21243424713611603, + "sampling/sampling_logp_difference/max": 1.5491228103637695, + "sampling/sampling_logp_difference/mean": 0.01530374214053154, + "step": 24 + }, + { + "clip_ratio/high_max": 0.00016221465284615988, + "clip_ratio/high_mean": 5.93273357480939e-05, + "clip_ratio/low_mean": 0.0003561860394256655, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00041551337380951736, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16105.0, + "completions/mean_length": 7169.59375, + "completions/mean_terminated_length": 7023.33349609375, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "entropy": 0.5559867396950722, + "epoch": 0.011499540018399264, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0009040784207172692, + "learning_rate": 1e-05, + "loss": 0.0516, + "num_tokens": 8657286.0, + "reward": 0.328125, + "reward_std": 0.2414703518152237, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000077247619629, + "sampling/importance_sampling_ratio/min": 0.244469553232193, + "sampling/sampling_logp_difference/max": 1.4086644649505615, + 
"sampling/sampling_logp_difference/mean": 0.021266434341669083, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0001577084094606107, + "clip_ratio/high_mean": 4.298096519050887e-05, + "clip_ratio/low_mean": 0.00013108373877912527, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001740647035148868, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15311.0, + "completions/mean_length": 6734.921875, + "completions/mean_terminated_length": 6091.650390625, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "entropy": 0.44154683500528336, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002558791544288397, + "learning_rate": 1e-05, + "loss": 0.0372, + "num_tokens": 9099577.0, + "reward": 0.515625, + "reward_std": 0.2777610719203949, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999955296516418, + "sampling/importance_sampling_ratio/min": 0.077813521027565, + "sampling/sampling_logp_difference/max": 2.5534400939941406, + "sampling/sampling_logp_difference/mean": 0.0186590775847435, + "step": 26 + }, + { + "clip_ratio/high_max": 0.00014542990538757294, + "clip_ratio/high_mean": 3.6357476346893236e-05, + "clip_ratio/low_mean": 0.00021458245646499563, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00025093993099289946, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15505.0, + "completions/mean_length": 4848.078125, + "completions/mean_terminated_length": 4475.95166015625, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.4912428632378578, + "epoch": 0.012419503219871205, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017661909805610776, + "learning_rate": 
1e-05, + "loss": 0.0957, + "num_tokens": 9420006.0, + "reward": 0.515625, + "reward_std": 0.3403330445289612, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000015139579773, + "sampling/importance_sampling_ratio/min": 0.14381231367588043, + "sampling/sampling_logp_difference/max": 1.9392461776733398, + "sampling/sampling_logp_difference/mean": 0.017206422984600067, + "step": 27 + }, + { + "clip_ratio/high_max": 0.00031798147210793104, + "clip_ratio/high_mean": 0.00010812525488290703, + "clip_ratio/low_mean": 0.00021282920124576776, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00032095445021695923, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15477.0, + "completions/mean_length": 5689.8125, + "completions/mean_terminated_length": 5163.86865234375, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.4508574977517128, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0030540244188159704, + "learning_rate": 1e-05, + "loss": 0.0809, + "num_tokens": 9793746.0, + "reward": 0.53125, + "reward_std": 0.42552614212036133, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999897480010986, + "sampling/importance_sampling_ratio/min": 8.414050967076037e-07, + "sampling/sampling_logp_difference/max": 13.988192558288574, + "sampling/sampling_logp_difference/mean": 0.016547517850995064, + "step": 28 + }, + { + "clip_ratio/high_max": 0.00019940425045206212, + "clip_ratio/high_mean": 5.6281104662048165e-05, + "clip_ratio/low_mean": 0.00010776506042020628, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00016404616417275975, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 14758.0, + "completions/max_terminated_length": 14758.0, + "completions/mean_length": 3069.78125, + "completions/mean_terminated_length": 3069.78125, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.39274851977825165, + "epoch": 0.013339466421343146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034625211264938116, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 10000348.0, + "reward": 0.546875, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000138282775879, + "sampling/importance_sampling_ratio/min": 0.32597410678863525, + "sampling/sampling_logp_difference/max": 1.1209373474121094, + "sampling/sampling_logp_difference/mean": 0.014218954369425774, + "step": 29 + }, + { + "clip_ratio/high_max": 0.00012761429206875619, + "clip_ratio/high_mean": 4.307139124648529e-05, + "clip_ratio/low_mean": 0.00010018590637628222, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00014325729807751486, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16360.0, + "completions/mean_length": 5308.3125, + "completions/mean_terminated_length": 4763.6064453125, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "entropy": 0.50441013276577, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.00156789505854249, + "learning_rate": 1e-05, + "loss": 0.0046, + "num_tokens": 10350440.0, + "reward": 0.515625, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000964403152466, + 
"sampling/importance_sampling_ratio/min": 0.04705130681395531, + "sampling/sampling_logp_difference/max": 3.056516647338867, + "sampling/sampling_logp_difference/mean": 0.019430290907621384, + "step": 30 + }, + { + "clip_ratio/high_max": 0.00016632911138003692, + "clip_ratio/high_mean": 5.557040094572585e-05, + "clip_ratio/low_mean": 0.0002778837697405834, + "clip_ratio/low_min": 1.6620682799839415e-05, + "clip_ratio/region_mean": 0.00033345417978125624, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15490.0, + "completions/mean_length": 6388.265625, + "completions/mean_terminated_length": 5354.22412109375, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.5342313349246979, + "epoch": 0.014259429622815088, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026365246158093214, + "learning_rate": 1e-05, + "loss": 0.0118, + "num_tokens": 10768153.0, + "reward": 0.359375, + "reward_std": 0.31983357667922974, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998627305030823, + "sampling/importance_sampling_ratio/min": 0.26772308349609375, + "sampling/sampling_logp_difference/max": 1.31780207157135, + "sampling/sampling_logp_difference/mean": 0.017920637503266335, + "step": 31 + }, + { + "clip_ratio/high_max": 0.00017989536627283087, + "clip_ratio/high_mean": 5.500852148543345e-05, + "clip_ratio/low_mean": 0.00012964008692506468, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018464860841049813, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14732.0, + "completions/mean_length": 5229.078125, + "completions/mean_terminated_length": 4869.24169921875, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + 
"entropy": 0.38906631618738174, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022169759031385183, + "learning_rate": 1e-05, + "loss": 0.0213, + "num_tokens": 11111918.0, + "reward": 0.765625, + "reward_std": 0.3629639744758606, + "rewards/accuracy_reward/mean": 0.765625, + "rewards/accuracy_reward/std": 0.42695629596710205, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999115467071533, + "sampling/importance_sampling_ratio/min": 0.08817384392023087, + "sampling/sampling_logp_difference/max": 2.4284448623657227, + "sampling/sampling_logp_difference/mean": 0.015222044661641121, + "step": 32 + }, + { + "clip_ratio/high_max": 0.00014480652316706255, + "clip_ratio/high_mean": 4.443957550392952e-05, + "clip_ratio/low_mean": 0.00012809812687919475, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00017253770374736632, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14850.0, + "completions/mean_length": 5148.453125, + "completions/mean_terminated_length": 4786.01611328125, + "completions/min_length": 815.0, + "completions/min_terminated_length": 815.0, + "entropy": 0.5083456933498383, + "epoch": 0.01517939282428703, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003128955839201808, + "learning_rate": 1e-05, + "loss": -0.0622, + "num_tokens": 11451323.0, + "reward": 0.53125, + "reward_std": 0.34034284949302673, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000025987625122, + "sampling/importance_sampling_ratio/min": 0.10359863191843033, + "sampling/sampling_logp_difference/max": 2.2672312259674072, + "sampling/sampling_logp_difference/mean": 0.017722681164741516, + "step": 33 + }, + { + "clip_ratio/high_max": 5.51352559341467e-05, + "clip_ratio/high_mean": 
1.3783813983536675e-05, + "clip_ratio/low_mean": 7.914142133813584e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.292523554904619e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13048.0, + "completions/mean_length": 4609.5, + "completions/mean_terminated_length": 3824.533447265625, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "entropy": 0.49830054119229317, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0007577431970275939, + "learning_rate": 1e-05, + "loss": 0.0132, + "num_tokens": 11758275.0, + "reward": 0.375, + "reward_std": 0.2041158676147461, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998879432678223, + "sampling/importance_sampling_ratio/min": 0.05370701104402542, + "sampling/sampling_logp_difference/max": 2.9242117404937744, + "sampling/sampling_logp_difference/mean": 0.01685405895113945, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0001986039806070039, + "clip_ratio/high_mean": 6.727558275088086e-05, + "clip_ratio/low_mean": 0.0003367365798112587, + "clip_ratio/low_min": 6.28791003691731e-05, + "clip_ratio/region_mean": 0.000404012165745371, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14497.0, + "completions/mean_length": 4593.015625, + "completions/mean_terminated_length": 4013.130859375, + "completions/min_length": 1094.0, + "completions/min_terminated_length": 1094.0, + "entropy": 0.3128826189786196, + "epoch": 0.01609935602575897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0026802816428244114, + "learning_rate": 1e-05, + "loss": 0.1212, + "num_tokens": 12063516.0, + "reward": 0.625, + "reward_std": 0.49234145879745483, + "rewards/accuracy_reward/mean": 0.625, + 
"rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999848008155823, + "sampling/importance_sampling_ratio/min": 0.0008915311773307621, + "sampling/sampling_logp_difference/max": 7.0225701332092285, + "sampling/sampling_logp_difference/mean": 0.01317686028778553, + "step": 35 + }, + { + "clip_ratio/high_max": 7.243978234328097e-05, + "clip_ratio/high_mean": 1.8109945585820242e-05, + "clip_ratio/low_mean": 9.390242212248268e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011201236907254497, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16161.0, + "completions/mean_length": 5015.171875, + "completions/mean_terminated_length": 4456.048828125, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 0.37973257526755333, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002345556626096368, + "learning_rate": 1e-05, + "loss": -0.0941, + "num_tokens": 12393103.0, + "reward": 0.640625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000877380371094, + "sampling/importance_sampling_ratio/min": 0.1842055469751358, + "sampling/sampling_logp_difference/max": 1.6917030811309814, + "sampling/sampling_logp_difference/mean": 0.0145792867988348, + "step": 36 + }, + { + "clip_ratio/high_max": 0.00014789494525757618, + "clip_ratio/high_mean": 4.601037198881386e-05, + "clip_ratio/low_mean": 0.0003090670288656838, + "clip_ratio/low_min": 1.8808304957929067e-05, + "clip_ratio/region_mean": 0.00035507740903995, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15632.0, + 
"completions/mean_length": 5598.484375, + "completions/mean_terminated_length": 5068.048828125, + "completions/min_length": 1283.0, + "completions/min_terminated_length": 1283.0, + "entropy": 0.35928424820303917, + "epoch": 0.01701931922723091, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0015618539182469249, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 12761230.0, + "reward": 0.546875, + "reward_std": 0.4240131676197052, + "rewards/accuracy_reward/mean": 0.546875, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999818801879883, + "sampling/importance_sampling_ratio/min": 0.2261282205581665, + "sampling/sampling_logp_difference/max": 2.6031017303466797, + "sampling/sampling_logp_difference/mean": 0.01447785273194313, + "step": 37 + }, + { + "clip_ratio/high_max": 4.21205932070734e-05, + "clip_ratio/high_mean": 1.053014830176835e-05, + "clip_ratio/low_mean": 4.961071590514621e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.014086420691456e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16037.0, + "completions/mean_length": 5366.125, + "completions/mean_terminated_length": 4824.26220703125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 0.41980869323015213, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0011855819029733539, + "learning_rate": 1e-05, + "loss": 0.0588, + "num_tokens": 13115038.0, + "reward": 0.5, + "reward_std": 0.17570312321186066, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999375343322754, + "sampling/importance_sampling_ratio/min": 0.15887950360774994, + "sampling/sampling_logp_difference/max": 1.839609146118164, + 
"sampling/sampling_logp_difference/mean": 0.015550841577351093, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0003506070097500924, + "clip_ratio/high_mean": 0.00010976320845657028, + "clip_ratio/low_mean": 0.0001256909990843269, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023545420481241308, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15317.0, + "completions/max_terminated_length": 15317.0, + "completions/mean_length": 3308.296875, + "completions/mean_terminated_length": 3308.296875, + "completions/min_length": 786.0, + "completions/min_terminated_length": 786.0, + "entropy": 0.38983067497611046, + "epoch": 0.017939282428702852, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0023375866003334522, + "learning_rate": 1e-05, + "loss": 0.0624, + "num_tokens": 13335329.0, + "reward": 0.59375, + "reward_std": 0.3913668990135193, + "rewards/accuracy_reward/mean": 0.59375, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99998939037323, + "sampling/importance_sampling_ratio/min": 0.0030945157632231712, + "sampling/sampling_logp_difference/max": 5.77812385559082, + "sampling/sampling_logp_difference/mean": 0.013900299556553364, + "step": 39 + }, + { + "clip_ratio/high_max": 0.000169710889167618, + "clip_ratio/high_mean": 5.673388113791589e-05, + "clip_ratio/low_mean": 0.00029868036835978273, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000355414251316688, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15761.0, + "completions/mean_length": 5426.078125, + "completions/mean_terminated_length": 4497.44091796875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.43789565935730934, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0025193989276885986, + "learning_rate": 1e-05, + 
"loss": 0.0349, + "num_tokens": 13691110.0, + "reward": 0.5, + "reward_std": 0.45134252309799194, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 0.14047929644584656, + "sampling/sampling_logp_difference/max": 1.9626951217651367, + "sampling/sampling_logp_difference/mean": 0.015961986035108566, + "step": 40 + }, + { + "clip_ratio/high_max": 8.76178437465569e-05, + "clip_ratio/high_mean": 2.3123878236219753e-05, + "clip_ratio/low_mean": 0.00019285815869807266, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002159820378437871, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16374.0, + "completions/mean_length": 4766.140625, + "completions/mean_terminated_length": 4194.77001953125, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "entropy": 0.47973789647221565, + "epoch": 0.018859245630174794, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0005962434806860983, + "learning_rate": 1e-05, + "loss": 0.0018, + "num_tokens": 14006911.0, + "reward": 0.484375, + "reward_std": 0.2382849156856537, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000395774841309, + "sampling/importance_sampling_ratio/min": 0.12198832631111145, + "sampling/sampling_logp_difference/max": 2.103829860687256, + "sampling/sampling_logp_difference/mean": 0.016915298998355865, + "step": 41 + }, + { + "clip_ratio/high_max": 6.694088551739696e-05, + "clip_ratio/high_mean": 2.3428712665918283e-05, + "clip_ratio/low_mean": 0.0002706102432057378, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002940389586001402, + 
"completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14691.0, + "completions/mean_length": 5922.421875, + "completions/mean_terminated_length": 4637.66650390625, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.42647283896803856, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001872243476100266, + "learning_rate": 1e-05, + "loss": 0.0244, + "num_tokens": 14394946.0, + "reward": 0.4375, + "reward_std": 0.36295416951179504, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999639987945557, + "sampling/importance_sampling_ratio/min": 0.293357253074646, + "sampling/sampling_logp_difference/max": 2.1049091815948486, + "sampling/sampling_logp_difference/mean": 0.01656758040189743, + "step": 42 + }, + { + "clip_ratio/high_max": 0.00015323197931138566, + "clip_ratio/high_mean": 4.9833591447168146e-05, + "clip_ratio/low_mean": 0.00034982425768248504, + "clip_ratio/low_min": 1.088660519599216e-05, + "clip_ratio/region_mean": 0.0003996578489022795, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16255.0, + "completions/mean_length": 6493.1875, + "completions/mean_terminated_length": 6006.75390625, + "completions/min_length": 479.0, + "completions/min_terminated_length": 479.0, + "entropy": 0.4782983772456646, + "epoch": 0.019779208831646734, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.00166318379342556, + "learning_rate": 1e-05, + "loss": 0.0511, + "num_tokens": 14821182.0, + "reward": 0.46875, + "reward_std": 0.4092700183391571, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998992085456848, 
+ "sampling/importance_sampling_ratio/min": 1.7716387219479657e-06, + "sampling/sampling_logp_difference/max": 13.243605613708496, + "sampling/sampling_logp_difference/mean": 0.018610000610351562, + "step": 43 + }, + { + "clip_ratio/high_max": 6.034070747773512e-05, + "clip_ratio/high_mean": 1.6863068026395922e-05, + "clip_ratio/low_mean": 9.460987712373026e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00011147294480906567, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16269.0, + "completions/mean_length": 4648.546875, + "completions/mean_terminated_length": 4269.98388671875, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.4597437307238579, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0008557081455364823, + "learning_rate": 1e-05, + "loss": 0.069, + "num_tokens": 15128561.0, + "reward": 0.328125, + "reward_std": 0.23144522309303284, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 0.2670474946498871, + "sampling/sampling_logp_difference/max": 1.320328712463379, + "sampling/sampling_logp_difference/mean": 0.016183078289031982, + "step": 44 + }, + { + "clip_ratio/high_max": 0.00016895902081159875, + "clip_ratio/high_mean": 6.0399999711080454e-05, + "clip_ratio/low_mean": 0.0002296717866556719, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00029007178636675235, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15720.0, + "completions/mean_length": 6930.234375, + "completions/mean_terminated_length": 6129.06787109375, + "completions/min_length": 682.0, + "completions/min_terminated_length": 682.0, + "entropy": 
0.5115556567907333, + "epoch": 0.020699172033118676, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016648141900077462, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 15582168.0, + "reward": 0.5625, + "reward_std": 0.3424547016620636, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.5, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000483989715576, + "sampling/importance_sampling_ratio/min": 0.187262162566185, + "sampling/sampling_logp_difference/max": 1.937586784362793, + "sampling/sampling_logp_difference/mean": 0.019788919016718864, + "step": 45 + }, + { + "clip_ratio/high_max": 9.100124134420184e-05, + "clip_ratio/high_mean": 3.351398640916159e-05, + "clip_ratio/low_mean": 0.000253890422754921, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002874044093914563, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16321.0, + "completions/mean_length": 6264.671875, + "completions/mean_terminated_length": 5938.24169921875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.43167873099446297, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0013617790536955, + "learning_rate": 1e-05, + "loss": 0.0032, + "num_tokens": 15994715.0, + "reward": 0.640625, + "reward_std": 0.3766237497329712, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.4836103618144989, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999179840087891, + "sampling/importance_sampling_ratio/min": 0.1620832085609436, + "sampling/sampling_logp_difference/max": 1.8196454048156738, + "sampling/sampling_logp_difference/mean": 0.017889156937599182, + "step": 46 + }, + { + "clip_ratio/high_max": 6.15748222116963e-05, + "clip_ratio/high_mean": 1.870576988949324e-05, + "clip_ratio/low_mean": 
0.0003191337254975224, + "clip_ratio/low_min": 4.877414176007733e-05, + "clip_ratio/region_mean": 0.0003378394994797418, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12838.0, + "completions/mean_length": 4168.140625, + "completions/mean_terminated_length": 3974.23828125, + "completions/min_length": 705.0, + "completions/min_terminated_length": 705.0, + "entropy": 0.433504331856966, + "epoch": 0.021619135234590615, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003133355872705579, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 16272044.0, + "reward": 0.34375, + "reward_std": 0.3377465009689331, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998913407325745, + "sampling/importance_sampling_ratio/min": 0.38697248697280884, + "sampling/sampling_logp_difference/max": 1.4266910552978516, + "sampling/sampling_logp_difference/mean": 0.014272443950176239, + "step": 47 + }, + { + "clip_ratio/high_max": 5.0198698772874195e-05, + "clip_ratio/high_mean": 1.2549674693218549e-05, + "clip_ratio/low_mean": 0.00024944932374637574, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002619989991217153, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14382.0, + "completions/mean_length": 5228.15625, + "completions/mean_terminated_length": 4868.2900390625, + "completions/min_length": 1099.0, + "completions/min_terminated_length": 1099.0, + "entropy": 0.6134471148252487, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002945883432403207, + "learning_rate": 1e-05, + "loss": 0.0237, + "num_tokens": 16616510.0, + "reward": 0.453125, + "reward_std": 0.39560043811798096, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 
0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000022053718567, + "sampling/importance_sampling_ratio/min": 0.23671367764472961, + "sampling/sampling_logp_difference/max": 1.4409040212631226, + "sampling/sampling_logp_difference/mean": 0.01892893575131893, + "step": 48 + }, + { + "clip_ratio/high_max": 0.00010992094757966697, + "clip_ratio/high_mean": 3.773104890569812e-05, + "clip_ratio/low_mean": 0.0002085948569856555, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002463259042997379, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15580.0, + "completions/max_terminated_length": 15580.0, + "completions/mean_length": 4286.90625, + "completions/mean_terminated_length": 4286.90625, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "entropy": 0.3194341119378805, + "epoch": 0.022539098436062558, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0033129912335425615, + "learning_rate": 1e-05, + "loss": -0.0135, + "num_tokens": 16903128.0, + "reward": 0.578125, + "reward_std": 0.4113916754722595, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000847578048706, + "sampling/importance_sampling_ratio/min": 0.14042755961418152, + "sampling/sampling_logp_difference/max": 1.9630634784698486, + "sampling/sampling_logp_difference/mean": 0.0129241943359375, + "step": 49 + }, + { + "clip_ratio/high_max": 0.00010812897107825847, + "clip_ratio/high_mean": 3.162783127663715e-05, + "clip_ratio/low_mean": 0.0001828691292757867, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00021449696214403957, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15964.0, + "completions/mean_length": 5032.125, + "completions/mean_terminated_length": 
4070.101806640625, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "entropy": 0.4777919165790081, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021068111527711153, + "learning_rate": 1e-05, + "loss": -0.0866, + "num_tokens": 17236504.0, + "reward": 0.515625, + "reward_std": 0.29826053977012634, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5037065148353577, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000327825546265, + "sampling/importance_sampling_ratio/min": 0.2832590341567993, + "sampling/sampling_logp_difference/max": 1.8220746517181396, + "sampling/sampling_logp_difference/mean": 0.01738543063402176, + "step": 50 + }, + { + "clip_ratio/high_max": 0.00012820017036574427, + "clip_ratio/high_mean": 3.647331323008984e-05, + "clip_ratio/low_mean": 0.00025561100665072445, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002920843198808143, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13201.0, + "completions/mean_length": 4803.203125, + "completions/mean_terminated_length": 4619.38134765625, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "entropy": 0.4494751952588558, + "epoch": 0.023459061637534497, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028032760601490736, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 17553269.0, + "reward": 0.609375, + "reward_std": 0.3403330445289612, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999998152256012, + "sampling/importance_sampling_ratio/min": 0.21100811660289764, + "sampling/sampling_logp_difference/max": 1.5558586120605469, + "sampling/sampling_logp_difference/mean": 0.01737060397863388, + 
"step": 51 + }, + { + "clip_ratio/high_max": 0.00010267168681821204, + "clip_ratio/high_mean": 3.3487939049337e-05, + "clip_ratio/low_mean": 0.00015384274320240365, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00018733068225174065, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16160.0, + "completions/mean_length": 7100.3125, + "completions/mean_terminated_length": 6643.7373046875, + "completions/min_length": 1183.0, + "completions/min_terminated_length": 1183.0, + "entropy": 0.5009776279330254, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001591994776390493, + "learning_rate": 1e-05, + "loss": -0.0421, + "num_tokens": 18016729.0, + "reward": 0.453125, + "reward_std": 0.28930896520614624, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000343322753906, + "sampling/importance_sampling_ratio/min": 0.09941783547401428, + "sampling/sampling_logp_difference/max": 2.3084237575531006, + "sampling/sampling_logp_difference/mean": 0.01882891170680523, + "step": 52 + }, + { + "clip_ratio/high_max": 0.00016665930297676823, + "clip_ratio/high_mean": 5.2525359819810546e-05, + "clip_ratio/low_mean": 0.0004211304803902749, + "clip_ratio/low_min": 9.529018279863521e-05, + "clip_ratio/region_mean": 0.0004736558298645832, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14227.0, + "completions/mean_length": 6233.796875, + "completions/mean_terminated_length": 5557.1171875, + "completions/min_length": 1338.0, + "completions/min_terminated_length": 1338.0, + "entropy": 0.48881014063954353, + "epoch": 0.02437902483900644, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003694011364132166, + "learning_rate": 1e-05, + "loss": 0.1627, + "num_tokens": 18426140.0, 
+ "reward": 0.625, + "reward_std": 0.3977220952510834, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 0.20072485506534576, + "sampling/sampling_logp_difference/max": 1.6058201789855957, + "sampling/sampling_logp_difference/mean": 0.01879170536994934, + "step": 53 + }, + { + "clip_ratio/high_max": 0.00012100895446565119, + "clip_ratio/high_mean": 4.9377299660591234e-05, + "clip_ratio/low_mean": 0.00019421957949816715, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00024359687631658744, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15854.0, + "completions/mean_length": 5629.03125, + "completions/mean_terminated_length": 5282.0966796875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.3631018362939358, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.001484633656218648, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 18794958.0, + "reward": 0.609375, + "reward_std": 0.4050365090370178, + "rewards/accuracy_reward/mean": 0.609375, + "rewards/accuracy_reward/std": 0.4917473793029785, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000042200088501, + "sampling/importance_sampling_ratio/min": 0.002677773591130972, + "sampling/sampling_logp_difference/max": 5.922769546508789, + "sampling/sampling_logp_difference/mean": 0.013976464979350567, + "step": 54 + }, + { + "clip_ratio/high_max": 0.00021361040307965595, + "clip_ratio/high_mean": 8.756921079111635e-05, + "clip_ratio/low_mean": 0.0002042179089585261, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00029178711429267423, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 
16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 5366.453125, + "completions/mean_terminated_length": 5191.57177734375, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 0.34573371335864067, + "epoch": 0.025298988040478382, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0018017840338870883, + "learning_rate": 1e-05, + "loss": -0.0307, + "num_tokens": 19148275.0, + "reward": 0.734375, + "reward_std": 0.4050365090370178, + "rewards/accuracy_reward/mean": 0.734375, + "rewards/accuracy_reward/std": 0.44515693187713623, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999160766601562, + "sampling/importance_sampling_ratio/min": 0.22769968211650848, + "sampling/sampling_logp_difference/max": 1.4797277450561523, + "sampling/sampling_logp_difference/mean": 0.014456957578659058, + "step": 55 + }, + { + "clip_ratio/high_max": 0.00020042336745973444, + "clip_ratio/high_mean": 5.850923639627581e-05, + "clip_ratio/low_mean": 0.00019344742031535134, + "clip_ratio/low_min": 1.594387686054688e-05, + "clip_ratio/region_mean": 0.0002519566587579902, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15942.0, + "completions/mean_length": 5801.921875, + "completions/mean_terminated_length": 5460.564453125, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "entropy": 0.4420101195573807, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0007390208193100989, + "learning_rate": 1e-05, + "loss": 0.0368, + "num_tokens": 19530718.0, + "reward": 0.421875, + "reward_std": 0.2993341088294983, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999010562896729, + 
"sampling/importance_sampling_ratio/min": 0.04691341519355774, + "sampling/sampling_logp_difference/max": 3.0594515800476074, + "sampling/sampling_logp_difference/mean": 0.016371876001358032, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0001929260479300865, + "clip_ratio/high_mean": 7.267188334481034e-05, + "clip_ratio/low_mean": 0.00013643273086927366, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020910461648782075, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15886.0, + "completions/max_terminated_length": 15886.0, + "completions/mean_length": 3581.09375, + "completions/mean_terminated_length": 3581.09375, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "entropy": 0.36750902235507965, + "epoch": 0.02621895124195032, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020201546140015125, + "learning_rate": 1e-05, + "loss": 0.1245, + "num_tokens": 19771076.0, + "reward": 0.578125, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000082015991211, + "sampling/importance_sampling_ratio/min": 0.21508392691612244, + "sampling/sampling_logp_difference/max": 2.204270362854004, + "sampling/sampling_logp_difference/mean": 0.013558689504861832, + "step": 57 + }, + { + "clip_ratio/high_max": 0.00019395453546167118, + "clip_ratio/high_mean": 6.426821187233145e-05, + "clip_ratio/low_mean": 0.00017469121939939214, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00023895943377283402, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14150.0, + "completions/max_terminated_length": 14150.0, + "completions/mean_length": 4180.46875, + "completions/mean_terminated_length": 4180.46875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.4649594761431217, + "epoch": 
0.02667893284268629, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0028552189469337463, + "learning_rate": 1e-05, + "loss": 0.0924, + "num_tokens": 20048138.0, + "reward": 0.53125, + "reward_std": 0.4276576042175293, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5029674172401428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000083446502686, + "sampling/importance_sampling_ratio/min": 0.2393883913755417, + "sampling/sampling_logp_difference/max": 1.4296680688858032, + "sampling/sampling_logp_difference/mean": 0.017490293830633163, + "step": 58 + }, + { + "clip_ratio/high_max": 0.00014915554584149504, + "clip_ratio/high_mean": 3.9898490058476455e-05, + "clip_ratio/low_mean": 5.383538700698409e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 9.373387524647114e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15323.0, + "completions/max_terminated_length": 15323.0, + "completions/mean_length": 4642.15625, + "completions/mean_terminated_length": 4642.15625, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "entropy": 0.41386983543634415, + "epoch": 0.027138914443422264, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014837872004136443, + "learning_rate": 1e-05, + "loss": -0.0232, + "num_tokens": 20355020.0, + "reward": 0.65625, + "reward_std": 0.3198433816432953, + "rewards/accuracy_reward/mean": 0.65625, + "rewards/accuracy_reward/std": 0.4787135720252991, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001411437988281, + "sampling/importance_sampling_ratio/min": 0.022514859214425087, + "sampling/sampling_logp_difference/max": 3.7935798168182373, + "sampling/sampling_logp_difference/mean": 0.015344480983912945, + "step": 59 + }, + { + "clip_ratio/high_max": 7.379077214864083e-05, + "clip_ratio/high_mean": 2.223373576271115e-05, + "clip_ratio/low_mean": 
0.00013174474815969006, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001539784839224012, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15170.0, + "completions/max_terminated_length": 15170.0, + "completions/mean_length": 3369.015625, + "completions/mean_terminated_length": 3369.015625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.46293293312191963, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023857210762798786, + "learning_rate": 1e-05, + "loss": 0.0587, + "num_tokens": 20579309.0, + "reward": 0.40625, + "reward_std": 0.29143065214157104, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999541640281677, + "sampling/importance_sampling_ratio/min": 0.00012647465337067842, + "sampling/sampling_logp_difference/max": 8.975468635559082, + "sampling/sampling_logp_difference/mean": 0.016323832795023918, + "step": 60 + }, + { + "clip_ratio/high_max": 0.00010131701310456265, + "clip_ratio/high_mean": 3.068578371312469e-05, + "clip_ratio/low_mean": 0.00017564234258315992, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0002063281253867899, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15267.0, + "completions/mean_length": 4186.265625, + "completions/mean_terminated_length": 3992.651123046875, + "completions/min_length": 636.0, + "completions/min_terminated_length": 636.0, + "entropy": 0.4424850195646286, + "epoch": 0.028058877644894203, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.001888959901407361, + "learning_rate": 1e-05, + "loss": -0.0867, + "num_tokens": 20858230.0, + "reward": 0.5, + "reward_std": 0.43401283025741577, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5039526224136353, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001115798950195, + "sampling/importance_sampling_ratio/min": 0.21523967385292053, + "sampling/sampling_logp_difference/max": 1.5360031127929688, + "sampling/sampling_logp_difference/mean": 0.015638090670108795, + "step": 61 + }, + { + "clip_ratio/high_max": 0.00018883940902014729, + "clip_ratio/high_mean": 6.83412895341462e-05, + "clip_ratio/low_mean": 0.00029582804199890234, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0003641693292593118, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15784.0, + "completions/mean_length": 8232.328125, + "completions/mean_terminated_length": 7231.24560546875, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "entropy": 0.4720785431563854, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0010464832885190845, + "learning_rate": 1e-05, + "loss": 0.0678, + "num_tokens": 21394763.0, + "reward": 0.421875, + "reward_std": 0.30617380142211914, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999436736106873, + "sampling/importance_sampling_ratio/min": 0.05187493562698364, + "sampling/sampling_logp_difference/max": 2.9589195251464844, + "sampling/sampling_logp_difference/mean": 0.019340507686138153, + "step": 62 + }, + { + "clip_ratio/high_max": 7.807558085914934e-05, + "clip_ratio/high_mean": 2.2267657527663687e-05, + "clip_ratio/low_mean": 0.0001811299157452595, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00020339757793408353, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15783.0, + "completions/mean_length": 6065.875, + "completions/mean_terminated_length": 
5558.42578125, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "entropy": 0.5249982811510563, + "epoch": 0.028978840846366146, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016154105542227626, + "learning_rate": 1e-05, + "loss": 0.1536, + "num_tokens": 21793091.0, + "reward": 0.40625, + "reward_std": 0.2756394147872925, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49501484632492065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998780488967896, + "sampling/importance_sampling_ratio/min": 0.05374135076999664, + "sampling/sampling_logp_difference/max": 2.923572540283203, + "sampling/sampling_logp_difference/mean": 0.017961012199521065, + "step": 63 + }, + { + "clip_ratio/high_max": 3.358934282005066e-05, + "clip_ratio/high_mean": 8.397335705012665e-06, + "clip_ratio/low_mean": 3.994480266555911e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.834213746107707e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15670.0, + "completions/mean_length": 5830.015625, + "completions/mean_terminated_length": 5489.564453125, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.49247242510318756, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0013925280654802918, + "learning_rate": 1e-05, + "loss": 0.0145, + "num_tokens": 22176908.0, + "reward": 0.375, + "reward_std": 0.1872510462999344, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000190734863281, + "sampling/importance_sampling_ratio/min": 0.00015296634228434414, + "sampling/sampling_logp_difference/max": 8.785292625427246, + "sampling/sampling_logp_difference/mean": 0.016575772315263748, + "step": 64 
+ } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 22176908, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_lora_7b_20251202_002719/checkpoint-64/zero_to_fp32.py b/dapo_lora_7b_20251202_002719/checkpoint-64/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_lora_7b_20251202_002719/checkpoint-64/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/README.md b/dapo_lora_plus_20251202_001141/checkpoint-128/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-128/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users 
(both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-128/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-128/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", 
+ "k_proj", + "gate_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-128/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-128/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false 
-%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/latest b/dapo_lora_plus_20251202_001141/checkpoint-128/latest new file mode 100644 index 0000000000000000000000000000000000000000..b4db7fb020d9ef75e52048bf0cde7481e3ef9351 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-128/latest @@ -0,0 +1 @@ +global_step128 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-128/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-128/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/tokenizer_config.json 
b/dapo_lora_plus_20251202_001141/checkpoint-128/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-128/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": 
"LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-128/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7554bec987cbd24bb2cef715f0fe73e0a1ecbcbd --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-128/trainer_state.json @@ -0,0 +1,4002 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.11775528978840846, + "eval_steps": 500, + "global_step": 128, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025745572056621313, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 5.499582130141789e-06, + "clip_ratio/high_mean": 1.3748955325354473e-06, + "clip_ratio/low_mean": 
2.871888784738985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009378326623846e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 4767.1875, + "completions/mean_terminated_length": 4767.1875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.088237851858139, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002068034838885069, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 1425798.0, + "reward": 0.3046875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 0.01811397261917591, + "sampling/sampling_logp_difference/max": 4.011071681976318, + "sampling/sampling_logp_difference/mean": 0.01877593621611595, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.459846724103045e-05, + "clip_ratio/low_min": 3.4060874440910993e-06, + "clip_ratio/region_mean": 4.459846724103045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 6586.359375, + "completions/mean_terminated_length": 6351.21630859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0497623533010483, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001971944235265255, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 2287420.0, + "reward": 0.28125, + "reward_std": 0.29143062233924866, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999316334724426, + "sampling/importance_sampling_ratio/min": 5.356698966352269e-05, + "sampling/sampling_logp_difference/max": 9.834577560424805, + "sampling/sampling_logp_difference/mean": 0.02137824520468712, + "step": 3 + }, + { + "clip_ratio/high_max": 1.7640652004047297e-05, + "clip_ratio/high_mean": 5.48578327652649e-06, + "clip_ratio/low_mean": 3.218628648937738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.767206976590387e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14690.0, + "completions/max_terminated_length": 14690.0, + "completions/mean_length": 5448.0234375, + "completions/mean_terminated_length": 5448.0234375, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 1.1134418621659279, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016465173102915287, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 3009167.0, + "reward": 0.2890625, + "reward_std": 0.27958330512046814, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 7.889385415182915e-06, + "sampling/sampling_logp_difference/max": 11.749992370605469, + "sampling/sampling_logp_difference/mean": 0.020580951124429703, + "step": 4 + }, + { + "clip_ratio/high_max": 1.3439519989333348e-05, + "clip_ratio/high_mean": 3.359879997333337e-06, + "clip_ratio/low_mean": 2.8849915906903334e-05, + "clip_ratio/low_min": 8.467687621305231e-06, + "clip_ratio/region_mean": 3.220979442630778e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13420.0, + "completions/mean_length": 5436.8671875, + 
"completions/mean_terminated_length": 5350.66943359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 1.1473859176039696, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023770295083522797, + "learning_rate": 1e-05, + "loss": 0.0153, + "num_tokens": 3725654.0, + "reward": 0.2734375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991774559021, + "sampling/importance_sampling_ratio/min": 0.0011146117467433214, + "sampling/sampling_logp_difference/max": 6.799249172210693, + "sampling/sampling_logp_difference/mean": 0.020377254113554955, + "step": 5 + }, + { + "clip_ratio/high_max": 4.652201369026443e-06, + "clip_ratio/high_mean": 1.1630503422566107e-06, + "clip_ratio/low_mean": 2.8399212624208303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9562263534899103e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14440.0, + "completions/max_terminated_length": 14440.0, + "completions/mean_length": 4697.5390625, + "completions/mean_terminated_length": 4697.5390625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.0097229778766632, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003342699259519577, + "learning_rate": 1e-05, + "loss": 0.0326, + "num_tokens": 4345547.0, + "reward": 0.390625, + "reward_std": 0.34480881690979004, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914765357971, + "sampling/importance_sampling_ratio/min": 0.002385853324085474, + "sampling/sampling_logp_difference/max": 6.038198471069336, + 
"sampling/sampling_logp_difference/mean": 0.0185473021119833, + "step": 6 + }, + { + "clip_ratio/high_max": 9.362594937556423e-06, + "clip_ratio/high_mean": 2.340648734389106e-06, + "clip_ratio/low_mean": 6.054362825125281e-05, + "clip_ratio/low_min": 7.427356649714056e-06, + "clip_ratio/region_mean": 6.288427744038927e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14652.0, + "completions/mean_length": 6218.2109375, + "completions/mean_terminated_length": 5890.2822265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.0579778030514717, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002073560608550906, + "learning_rate": 1e-05, + "loss": 0.0201, + "num_tokens": 5160646.0, + "reward": 0.2109375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 0.00044544730917550623, + "sampling/sampling_logp_difference/max": 7.716431617736816, + "sampling/sampling_logp_difference/mean": 0.020321575924754143, + "step": 7 + }, + { + "clip_ratio/high_max": 1.1064067621191498e-05, + "clip_ratio/high_mean": 2.7660169052978745e-06, + "clip_ratio/low_mean": 2.2175867059104348e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4941883737028547e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13637.0, + "completions/mean_length": 5127.8359375, + "completions/mean_terminated_length": 5039.20458984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.0472618415951729, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.0032994600478559732, + "learning_rate": 1e-05, + "loss": 0.0751, + "num_tokens": 5836289.0, + "reward": 0.3359375, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483227729797, + "sampling/importance_sampling_ratio/min": 0.0013780994340777397, + "sampling/sampling_logp_difference/max": 6.587049961090088, + "sampling/sampling_logp_difference/mean": 0.01940803974866867, + "step": 8 + }, + { + "clip_ratio/high_max": 1.2357884770608507e-05, + "clip_ratio/high_mean": 3.0894711926521268e-06, + "clip_ratio/low_mean": 3.000627111759968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.309574231025181e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15916.0, + "completions/mean_length": 4516.890625, + "completions/mean_terminated_length": 4423.44873046875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.911251038312912, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003016560571268201, + "learning_rate": 1e-05, + "loss": 0.1006, + "num_tokens": 6433171.0, + "reward": 0.390625, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999179840087891, + "sampling/importance_sampling_ratio/min": 0.005480794236063957, + "sampling/sampling_logp_difference/max": 5.206505298614502, + "sampling/sampling_logp_difference/mean": 0.017437148839235306, + "step": 9 + }, + { + "clip_ratio/high_max": 4.6329013457580004e-05, + "clip_ratio/high_mean": 1.1582253364395001e-05, + "clip_ratio/low_mean": 7.069455705277505e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 8.227681109929108e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13970.0, + "completions/mean_length": 4961.453125, + "completions/mean_terminated_length": 4687.31201171875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "entropy": 0.6808596402406693, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0035386616364121437, + "learning_rate": 1e-05, + "loss": 0.0596, + "num_tokens": 7085389.0, + "reward": 0.5625, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0002734088629949838, + "sampling/sampling_logp_difference/max": 8.20454216003418, + "sampling/sampling_logp_difference/mean": 0.01566406339406967, + "step": 10 + }, + { + "clip_ratio/high_max": 2.43190661421977e-05, + "clip_ratio/high_mean": 6.079766535549425e-06, + "clip_ratio/low_mean": 2.2395396172214532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8475162707763957e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14776.0, + "completions/mean_length": 4429.40625, + "completions/mean_terminated_length": 4335.275390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9181502386927605, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022535293828696012, + "learning_rate": 1e-05, + "loss": 0.0031, + "num_tokens": 7672185.0, + "reward": 0.3671875, + "reward_std": 0.20357418060302734, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998801946640015, + "sampling/importance_sampling_ratio/min": 5.315856554943821e-08, + "sampling/sampling_logp_difference/max": 16.74998664855957, + "sampling/sampling_logp_difference/mean": 0.018429335206747055, + "step": 11 + }, + { + "clip_ratio/high_max": 1.0117325928149512e-05, + "clip_ratio/high_mean": 2.529331482037378e-06, + "clip_ratio/low_mean": 1.1982813475697185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.45121450714214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5282.6796875, + "completions/mean_terminated_length": 5106.46875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "entropy": 1.113751620054245, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013591813622042537, + "learning_rate": 1e-05, + "loss": 0.0971, + "num_tokens": 8369000.0, + "reward": 0.3984375, + "reward_std": 0.3029736578464508, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 3.970265970565379e-05, + "sampling/sampling_logp_difference/max": 10.134092330932617, + "sampling/sampling_logp_difference/mean": 0.020221836864948273, + "step": 12 + }, + { + "clip_ratio/high_max": 5.411958227341529e-06, + "clip_ratio/high_mean": 1.3529895568353822e-06, + "clip_ratio/low_mean": 2.5284593846208736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6637583516730956e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15925.0, + "completions/mean_length": 6970.421875, + "completions/mean_terminated_length": 6744.49609375, + "completions/min_length": 53.0, + 
"completions/min_terminated_length": 53.0, + "entropy": 1.1721933633089066, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024079051800072193, + "learning_rate": 1e-05, + "loss": 0.0713, + "num_tokens": 9283182.0, + "reward": 0.171875, + "reward_std": 0.17965975403785706, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999163746833801, + "sampling/importance_sampling_ratio/min": 0.0008915197686292231, + "sampling/sampling_logp_difference/max": 7.0225830078125, + "sampling/sampling_logp_difference/mean": 0.021462474018335342, + "step": 13 + }, + { + "clip_ratio/high_max": 2.0661535927501973e-05, + "clip_ratio/high_mean": 5.165383981875493e-06, + "clip_ratio/low_mean": 2.4304956298237812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.947033948430544e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14658.0, + "completions/max_terminated_length": 14658.0, + "completions/mean_length": 4886.875, + "completions/mean_terminated_length": 4886.875, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 1.0108910650014877, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002063734456896782, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 9928446.0, + "reward": 0.3515625, + "reward_std": 0.2409384697675705, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 0.0003672837920021266, + "sampling/sampling_logp_difference/max": 7.9093756675720215, + "sampling/sampling_logp_difference/mean": 0.01918785460293293, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.4761846993424115e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4761846993424115e-06, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12992.0, + "completions/max_terminated_length": 12992.0, + "completions/mean_length": 4824.0078125, + "completions/mean_terminated_length": 4824.0078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 1.1070282831788063, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002424790756776929, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 10566415.0, + "reward": 0.28125, + "reward_std": 0.23698672652244568, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0011708867968991399, + "sampling/sampling_logp_difference/max": 6.749993801116943, + "sampling/sampling_logp_difference/mean": 0.02069389820098877, + "step": 15 + }, + { + "clip_ratio/high_max": 3.5075904634140898e-06, + "clip_ratio/high_mean": 8.768976158535224e-07, + "clip_ratio/low_mean": 2.2676964135825983e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3553861751679506e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12685.0, + "completions/mean_length": 5449.4140625, + "completions/mean_terminated_length": 5363.31494140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9817888736724854, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021046048495918512, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 11281908.0, + "reward": 0.2265625, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.2265625, + 
"rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.013273254036903381, + "sampling/sampling_logp_difference/max": 4.322004318237305, + "sampling/sampling_logp_difference/mean": 0.019556276500225067, + "step": 16 + }, + { + "clip_ratio/high_max": 1.624216065465589e-05, + "clip_ratio/high_mean": 4.060540163663973e-06, + "clip_ratio/low_mean": 5.4349347919924185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.840988796990132e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14133.0, + "completions/max_terminated_length": 14133.0, + "completions/mean_length": 5343.25, + "completions/mean_terminated_length": 5343.25, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 1.04741720110178, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035894038155674934, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 11987692.0, + "reward": 0.3359375, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998996257781982, + "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05, + "sampling/sampling_logp_difference/max": 10.749964714050293, + "sampling/sampling_logp_difference/mean": 0.020530637353658676, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.272115029380075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.272115029380075e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15138.0, + "completions/mean_length": 6301.9375, + "completions/mean_terminated_length": 
5806.09814453125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.8892941772937775, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032246762420982122, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 12814244.0, + "reward": 0.3125, + "reward_std": 0.3606000542640686, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999184608459473, + "sampling/importance_sampling_ratio/min": 0.021351110190153122, + "sampling/sampling_logp_difference/max": 3.846651554107666, + "sampling/sampling_logp_difference/mean": 0.017541853711009026, + "step": 18 + }, + { + "clip_ratio/high_max": 9.956602298188955e-06, + "clip_ratio/high_mean": 2.4891505745472386e-06, + "clip_ratio/low_mean": 2.772165316855535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0210803743102588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16213.0, + "completions/max_terminated_length": 16213.0, + "completions/mean_length": 5297.46875, + "completions/mean_terminated_length": 5297.46875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8097029253840446, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023969109170138836, + "learning_rate": 1e-05, + "loss": -0.0153, + "num_tokens": 13512520.0, + "reward": 0.359375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999222159385681, + "sampling/importance_sampling_ratio/min": 0.005766105372458696, + "sampling/sampling_logp_difference/max": 5.155758380889893, + "sampling/sampling_logp_difference/mean": 0.017464376986026764, + "step": 19 + }, + { + 
"clip_ratio/high_max": 1.0098337497765897e-05, + "clip_ratio/high_mean": 2.524584374441474e-06, + "clip_ratio/low_mean": 3.173396362399217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.425854845318099e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14655.0, + "completions/mean_length": 4890.34375, + "completions/mean_terminated_length": 4799.84228515625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.9267145916819572, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002759338356554508, + "learning_rate": 1e-05, + "loss": -0.0014, + "num_tokens": 14155556.0, + "reward": 0.3515625, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.008491010405123234, + "sampling/sampling_logp_difference/max": 4.768747329711914, + "sampling/sampling_logp_difference/mean": 0.018839433789253235, + "step": 20 + }, + { + "clip_ratio/high_max": 7.532389190600952e-06, + "clip_ratio/high_mean": 1.883097297650238e-06, + "clip_ratio/low_mean": 1.9051809317716106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0934906729053182e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16296.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 4609.40625, + "completions/mean_terminated_length": 4609.40625, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 1.171089917421341, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021055075339972973, + "learning_rate": 1e-05, + "loss": -0.0051, + "num_tokens": 14765328.0, + "reward": 0.2421875, + "reward_std": 
0.2409384548664093, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741911888123, + "sampling/importance_sampling_ratio/min": 5.368983693188056e-07, + "sampling/sampling_logp_difference/max": 14.437457084655762, + "sampling/sampling_logp_difference/mean": 0.020226795226335526, + "step": 21 + }, + { + "clip_ratio/high_max": 1.7169573766295798e-05, + "clip_ratio/high_mean": 4.2923934415739495e-06, + "clip_ratio/low_mean": 5.869748633813288e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.0162142189074075e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14299.0, + "completions/mean_length": 5099.0390625, + "completions/mean_terminated_length": 5010.18115234375, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.005959376692772, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027595218271017075, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 15438549.0, + "reward": 0.296875, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887347221375, + "sampling/importance_sampling_ratio/min": 0.00013984869292471558, + "sampling/sampling_logp_difference/max": 8.87494945526123, + "sampling/sampling_logp_difference/mean": 0.01902824640274048, + "step": 22 + }, + { + "clip_ratio/high_max": 5.162942670722259e-06, + "clip_ratio/high_mean": 1.2907356676805648e-06, + "clip_ratio/low_mean": 3.6872071063953626e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.816280593582633e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16204.0, + "completions/mean_length": 7138.0390625, + "completions/mean_terminated_length": 6839.7822265625, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.0403362140059471, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002748022088780999, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 16373898.0, + "reward": 0.296875, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999048709869385, + "sampling/importance_sampling_ratio/min": 0.0003802926803473383, + "sampling/sampling_logp_difference/max": 7.874569416046143, + "sampling/sampling_logp_difference/mean": 0.020853528752923012, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.6506045439164154e-05, + "clip_ratio/low_min": 5.709326615033206e-06, + "clip_ratio/region_mean": 5.6506045439164154e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14543.0, + "completions/mean_length": 5420.515625, + "completions/mean_terminated_length": 5334.18896484375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 1.1339883506298065, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029502976685762405, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 17088156.0, + "reward": 0.1953125, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 9.70982582657598e-05, + 
"sampling/sampling_logp_difference/max": 9.239787101745605, + "sampling/sampling_logp_difference/mean": 0.0199423898011446, + "step": 24 + }, + { + "clip_ratio/high_max": 5.619998319161823e-06, + "clip_ratio/high_mean": 1.4049995797904558e-06, + "clip_ratio/low_mean": 6.439320418394345e-05, + "clip_ratio/low_min": 4.70632539872895e-06, + "clip_ratio/region_mean": 6.57982034226734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14636.0, + "completions/mean_length": 5116.3046875, + "completions/mean_terminated_length": 4845.88037109375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.9503882825374603, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004891107324510813, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 17766619.0, + "reward": 0.3203125, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0010618992382660508, + "sampling/sampling_logp_difference/max": 6.847696304321289, + "sampling/sampling_logp_difference/mean": 0.01914183795452118, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.839018643247982e-05, + "clip_ratio/low_min": 4.115091087442124e-06, + "clip_ratio/region_mean": 3.839018643247982e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14634.0, + "completions/mean_length": 5061.8671875, + "completions/mean_terminated_length": 4972.71630859375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.0540335327386856, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0030373274348676205, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 18432938.0, + "reward": 0.34375, + "reward_std": 0.28118088841438293, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999624490737915, + "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06, + "sampling/sampling_logp_difference/max": 13.272432327270508, + "sampling/sampling_logp_difference/mean": 0.019548218697309494, + "step": 26 + }, + { + "clip_ratio/high_max": 1.4656657867817557e-05, + "clip_ratio/high_mean": 4.665093399580655e-06, + "clip_ratio/low_mean": 3.751162262233265e-05, + "clip_ratio/low_min": 4.413062470121076e-06, + "clip_ratio/region_mean": 4.2176716192443564e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15782.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6349.9765625, + "completions/mean_terminated_length": 6349.9765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0268081277608871, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017623496241867542, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 19264743.0, + "reward": 0.2734375, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 6.870362267363816e-05, + "sampling/sampling_logp_difference/max": 9.585708618164062, + "sampling/sampling_logp_difference/mean": 0.019106190651655197, + "step": 27 + }, + { + "clip_ratio/high_max": 9.221375876222737e-06, + "clip_ratio/high_mean": 2.3053439690556843e-06, + "clip_ratio/low_mean": 3.09787185415189e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.328406273794826e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 5815.484375, + "completions/mean_terminated_length": 5561.84033203125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 1.0389493256807327, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003111837198957801, + "learning_rate": 1e-05, + "loss": -0.0162, + "num_tokens": 20030109.0, + "reward": 0.34375, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000298023223877, + "sampling/importance_sampling_ratio/min": 0.02987043187022209, + "sampling/sampling_logp_difference/max": 3.5108861923217773, + "sampling/sampling_logp_difference/mean": 0.020060991868376732, + "step": 28 + }, + { + "clip_ratio/high_max": 6.7810142354574054e-06, + "clip_ratio/high_mean": 1.6952535588643514e-06, + "clip_ratio/low_mean": 4.474762545214617e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644287901101052e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 5157.1484375, + "completions/mean_terminated_length": 5068.748046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.0510126948356628, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003041633637621999, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 20710904.0, + "reward": 0.3125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.04357198625802994, + "sampling/sampling_logp_difference/max": 3.133340835571289, + "sampling/sampling_logp_difference/mean": 0.019007597118616104, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0962848566341563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0962848566341563e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15333.0, + "completions/max_terminated_length": 15333.0, + "completions/mean_length": 4446.3828125, + "completions/mean_terminated_length": 4446.3828125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.053279548883438, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022369560319930315, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 21298497.0, + "reward": 0.390625, + "reward_std": 0.24169495701789856, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998750686645508, + "sampling/importance_sampling_ratio/min": 0.006704842206090689, + "sampling/sampling_logp_difference/max": 5.00492525100708, + "sampling/sampling_logp_difference/mean": 0.01947362720966339, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8460265411922592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8460265411922592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15386.0, + "completions/mean_length": 6294.1484375, + "completions/mean_terminated_length": 6133.9921875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, 
+ "entropy": 1.2036212533712387, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021383841522037983, + "learning_rate": 1e-05, + "loss": 0.033, + "num_tokens": 22124812.0, + "reward": 0.171875, + "reward_std": 0.20752590894699097, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858736991882, + "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07, + "sampling/sampling_logp_difference/max": 14.742476463317871, + "sampling/sampling_logp_difference/mean": 0.022367021068930626, + "step": 31 + }, + { + "clip_ratio/high_max": 1.73864664247958e-05, + "clip_ratio/high_mean": 4.34661660619895e-06, + "clip_ratio/low_mean": 3.19569651310303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630358173722925e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14893.0, + "completions/mean_length": 6011.4921875, + "completions/mean_terminated_length": 5929.81884765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.123318687081337, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00126531848218292, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 22915091.0, + "reward": 0.171875, + "reward_std": 0.2330477386713028, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05, + "sampling/sampling_logp_difference/max": 11.02016544342041, + "sampling/sampling_logp_difference/mean": 0.019905246794223785, + "step": 32 + }, + { + "clip_ratio/high_max": 2.8753217975463485e-05, + "clip_ratio/high_mean": 
7.188304493865871e-06, + "clip_ratio/low_mean": 3.818478444372886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.537308905128157e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5152.46875, + "completions/mean_terminated_length": 5064.03125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 1.0477670058608055, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030069497879594564, + "learning_rate": 1e-05, + "loss": 0.1026, + "num_tokens": 23596487.0, + "reward": 0.3359375, + "reward_std": 0.29142576456069946, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 9.009604013954231e-07, + "sampling/sampling_logp_difference/max": 13.919804573059082, + "sampling/sampling_logp_difference/mean": 0.019003981724381447, + "step": 33 + }, + { + "clip_ratio/high_max": 3.069575450354023e-05, + "clip_ratio/high_mean": 7.673938625885057e-06, + "clip_ratio/low_mean": 3.4847614415411954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.252155258654966e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12792.0, + "completions/max_terminated_length": 12792.0, + "completions/mean_length": 4672.5703125, + "completions/mean_terminated_length": 4672.5703125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9471446052193642, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002676331205293536, + "learning_rate": 1e-05, + "loss": 0.0724, + "num_tokens": 24213408.0, + "reward": 0.3203125, + "reward_std": 0.2988021969795227, + "rewards/accuracy_reward/mean": 0.3203125, + 
"rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000251531600952, + "sampling/importance_sampling_ratio/min": 0.0013351094676181674, + "sampling/sampling_logp_difference/max": 6.618741989135742, + "sampling/sampling_logp_difference/mean": 0.0179576613008976, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6127243245355203e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6127243245355203e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16108.0, + "completions/mean_length": 7013.734375, + "completions/mean_terminated_length": 6711.4677734375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 1.1254516392946243, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023615453392267227, + "learning_rate": 1e-05, + "loss": 0.0384, + "num_tokens": 25130262.0, + "reward": 0.1953125, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06, + "sampling/sampling_logp_difference/max": 11.925450325012207, + "sampling/sampling_logp_difference/mean": 0.0215257927775383, + "step": 35 + }, + { + "clip_ratio/high_max": 4.06954040954588e-06, + "clip_ratio/high_mean": 1.01738510238647e-06, + "clip_ratio/low_mean": 4.180071573500754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.281810015527299e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5858.59375, + 
"completions/mean_terminated_length": 5605.984375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 1.0713739022612572, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029018481727689505, + "learning_rate": 1e-05, + "loss": 0.1041, + "num_tokens": 25898194.0, + "reward": 0.3671875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05, + "sampling/sampling_logp_difference/max": 10.992064476013184, + "sampling/sampling_logp_difference/mean": 0.019959844648838043, + "step": 36 + }, + { + "clip_ratio/high_max": 1.2810827229259303e-05, + "clip_ratio/high_mean": 3.2027068073148257e-06, + "clip_ratio/low_mean": 3.29701083501277e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.617281504375569e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14004.0, + "completions/mean_length": 6952.6015625, + "completions/mean_terminated_length": 6726.24853515625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.028619796037674, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022342968732118607, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 26812791.0, + "reward": 0.234375, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 4.540153167909011e-05, + "sampling/sampling_logp_difference/max": 9.999964714050293, + 
"sampling/sampling_logp_difference/mean": 0.02002539485692978, + "step": 37 + }, + { + "clip_ratio/high_max": 1.5225089100567857e-05, + "clip_ratio/high_mean": 6.960676159906143e-06, + "clip_ratio/low_mean": 4.09088329433871e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7869508762232726e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16361.0, + "completions/mean_length": 6413.421875, + "completions/mean_terminated_length": 6174.12841796875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9452399462461472, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021800603717565536, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 27652757.0, + "reward": 0.296875, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439120292664, + "sampling/importance_sampling_ratio/min": 3.895394547726028e-05, + "sampling/sampling_logp_difference/max": 10.153130531311035, + "sampling/sampling_logp_difference/mean": 0.019722118973731995, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.9564903318023426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9564903318023426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15754.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 5176.3515625, + "completions/mean_terminated_length": 5176.3515625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 1.0444758981466293, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004153470974415541, + "learning_rate": 1e-05, + "loss": 0.0798, + 
"num_tokens": 28334386.0, + "reward": 0.2734375, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.007421077694743872, + "sampling/sampling_logp_difference/max": 4.903430938720703, + "sampling/sampling_logp_difference/mean": 0.020159056410193443, + "step": 39 + }, + { + "clip_ratio/high_max": 1.725743459246587e-05, + "clip_ratio/high_mean": 4.3143586481164675e-06, + "clip_ratio/low_mean": 2.0204584302518924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.451894306432223e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 5178.9921875, + "completions/mean_terminated_length": 5001.13525390625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0803537145256996, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002477057045325637, + "learning_rate": 1e-05, + "loss": 0.0067, + "num_tokens": 29017145.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000497102737427, + "sampling/importance_sampling_ratio/min": 0.004630985204130411, + "sampling/sampling_logp_difference/max": 5.374985694885254, + "sampling/sampling_logp_difference/mean": 0.019826076924800873, + "step": 40 + }, + { + "clip_ratio/high_max": 1.6637992303003557e-05, + "clip_ratio/high_mean": 4.159498075750889e-06, + "clip_ratio/low_mean": 2.1970684144889674e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6130182106953725e-05, + "completions/clipped_ratio": 
0.0, + "completions/max_length": 14131.0, + "completions/max_terminated_length": 14131.0, + "completions/mean_length": 4980.359375, + "completions/mean_terminated_length": 4980.359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.9510642662644386, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016275218222290277, + "learning_rate": 1e-05, + "loss": -0.0097, + "num_tokens": 29673535.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750852584839, + "sampling/importance_sampling_ratio/min": 0.000599516904912889, + "sampling/sampling_logp_difference/max": 7.419386386871338, + "sampling/sampling_logp_difference/mean": 0.01844976656138897, + "step": 41 + }, + { + "clip_ratio/high_max": 2.8087193186365766e-05, + "clip_ratio/high_mean": 7.021798296591442e-06, + "clip_ratio/low_mean": 3.9683913541921356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.670571286169434e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 5778.6953125, + "completions/mean_terminated_length": 5695.18896484375, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 1.0413239300251007, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001847646082751453, + "learning_rate": 1e-05, + "loss": -0.0045, + "num_tokens": 30436416.0, + "reward": 0.2578125, + "reward_std": 0.33903977274894714, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998501539230347, + 
"sampling/importance_sampling_ratio/min": 0.00020348970429040492, + "sampling/sampling_logp_difference/max": 8.499895095825195, + "sampling/sampling_logp_difference/mean": 0.021502099931240082, + "step": 42 + }, + { + "clip_ratio/high_max": 2.68402091023745e-05, + "clip_ratio/high_mean": 8.575278570788214e-06, + "clip_ratio/low_mean": 4.547183698377921e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.404711600931478e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14182.0, + "completions/max_terminated_length": 14182.0, + "completions/mean_length": 4875.125, + "completions/mean_terminated_length": 4875.125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 1.0464690178632736, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021134833805263042, + "learning_rate": 1e-05, + "loss": 0.0727, + "num_tokens": 31083672.0, + "reward": 0.40625, + "reward_std": 0.3584783971309662, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340176582336, + "sampling/importance_sampling_ratio/min": 0.012113225646317005, + "sampling/sampling_logp_difference/max": 4.41345739364624, + "sampling/sampling_logp_difference/mean": 0.019140049815177917, + "step": 43 + }, + { + "clip_ratio/high_max": 3.9877967992651975e-05, + "clip_ratio/high_mean": 9.969491998162994e-06, + "clip_ratio/low_mean": 3.981287841270387e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9782369273998484e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 4691.421875, + "completions/mean_terminated_length": 4505.82568359375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 1.0229775309562683, + "epoch": 
0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037735572550445795, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 31703654.0, + "reward": 0.4453125, + "reward_std": 0.2993389964103699, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492168426514, + "sampling/importance_sampling_ratio/min": 0.03150063753128052, + "sampling/sampling_logp_difference/max": 3.457747459411621, + "sampling/sampling_logp_difference/mean": 0.01912039890885353, + "step": 44 + }, + { + "clip_ratio/high_max": 3.5441889849607833e-06, + "clip_ratio/high_mean": 8.860472462401958e-07, + "clip_ratio/low_mean": 1.5137359810069029e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6023407056309225e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 6821.96875, + "completions/mean_terminated_length": 6592.48046875, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 1.1132484003901482, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0010448681423440576, + "learning_rate": 1e-05, + "loss": 0.022, + "num_tokens": 32599778.0, + "reward": 0.2265625, + "reward_std": 0.1814819872379303, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999915361404419, + "sampling/importance_sampling_ratio/min": 0.006500681862235069, + "sampling/sampling_logp_difference/max": 5.035848140716553, + "sampling/sampling_logp_difference/mean": 0.02125459350645542, + "step": 45 + }, + { + "clip_ratio/high_max": 4.652893949241843e-06, + "clip_ratio/high_mean": 1.1632234873104608e-06, + "clip_ratio/low_mean": 
5.731516603191267e-05, + "clip_ratio/low_min": 9.891066838463303e-06, + "clip_ratio/region_mean": 5.8478389746596804e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 6834.3671875, + "completions/mean_terminated_length": 6605.17626953125, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9827468693256378, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0017670176457613707, + "learning_rate": 1e-05, + "loss": 0.1105, + "num_tokens": 33492737.0, + "reward": 0.3046875, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.0021202093921601772, + "sampling/sampling_logp_difference/max": 6.156240463256836, + "sampling/sampling_logp_difference/mean": 0.019490526989102364, + "step": 46 + }, + { + "clip_ratio/high_max": 6.717360520269722e-06, + "clip_ratio/high_mean": 2.503530367903295e-06, + "clip_ratio/low_mean": 2.5672919832686603e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8176450200589898e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14098.0, + "completions/mean_length": 6175.296875, + "completions/mean_terminated_length": 5845.98388671875, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 1.1584237962961197, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0016891945851966739, + "learning_rate": 1e-05, + "loss": -0.0008, + "num_tokens": 34312455.0, + "reward": 0.1875, + "reward_std": 0.19673937559127808, + "rewards/accuracy_reward/mean": 0.1875, + 
"rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 8.086384332273155e-05, + "sampling/sampling_logp_difference/max": 9.422743797302246, + "sampling/sampling_logp_difference/mean": 0.021749887615442276, + "step": 47 + }, + { + "clip_ratio/high_max": 2.2362002255249536e-05, + "clip_ratio/high_mean": 8.189798336388776e-06, + "clip_ratio/low_mean": 2.1058204993096297e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9248002192616696e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16054.0, + "completions/mean_length": 6036.8359375, + "completions/mean_terminated_length": 5955.3623046875, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.9301538467407227, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003834392176941037, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 35102738.0, + "reward": 0.4375, + "reward_std": 0.36614155769348145, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998494386672974, + "sampling/importance_sampling_ratio/min": 0.00013992394087836146, + "sampling/sampling_logp_difference/max": 8.874411582946777, + "sampling/sampling_logp_difference/mean": 0.019147861748933792, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1501961580506759e-05, + "clip_ratio/high_mean": 2.8754903951266897e-06, + "clip_ratio/low_mean": 4.08189714562468e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.369446196506033e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 
6262.46875, + "completions/mean_terminated_length": 5764.68798828125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.8599015846848488, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029804729856550694, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 35924886.0, + "reward": 0.3984375, + "reward_std": 0.3911295533180237, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999922513961792, + "sampling/importance_sampling_ratio/min": 0.00021375219512265176, + "sampling/sampling_logp_difference/max": 9.904524803161621, + "sampling/sampling_logp_difference/mean": 0.01815103553235531, + "step": 49 + }, + { + "clip_ratio/high_max": 2.4107544049911667e-05, + "clip_ratio/high_mean": 6.026886012477917e-06, + "clip_ratio/low_mean": 3.6588148361715866e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.261503391944643e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14556.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 5926.8984375, + "completions/mean_terminated_length": 5926.8984375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 1.0042993426322937, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022071697749197483, + "learning_rate": 1e-05, + "loss": 0.0059, + "num_tokens": 36700913.0, + "reward": 0.3359375, + "reward_std": 0.3306073546409607, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000010371208191, + "sampling/importance_sampling_ratio/min": 0.0005220364546403289, + "sampling/sampling_logp_difference/max": 7.557773113250732, + 
"sampling/sampling_logp_difference/mean": 0.01954064890742302, + "step": 50 + }, + { + "clip_ratio/high_max": 4.9106265578302555e-06, + "clip_ratio/high_mean": 1.2276566394575639e-06, + "clip_ratio/low_mean": 2.634599570683349e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7573652346291055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15217.0, + "completions/mean_length": 6873.6875, + "completions/mean_terminated_length": 6645.4404296875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 1.0255412608385086, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002320924773812294, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 37604865.0, + "reward": 0.234375, + "reward_std": 0.3135228157043457, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999098777770996, + "sampling/importance_sampling_ratio/min": 0.026153141632676125, + "sampling/sampling_logp_difference/max": 3.6437859535217285, + "sampling/sampling_logp_difference/mean": 0.019532475620508194, + "step": 51 + }, + { + "clip_ratio/high_max": 1.6350510122720152e-05, + "clip_ratio/high_mean": 4.087627530680038e-06, + "clip_ratio/low_mean": 2.351988746340794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7607515221461654e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15668.0, + "completions/mean_length": 6073.8984375, + "completions/mean_terminated_length": 5992.71630859375, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 1.0713753998279572, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002212709980085492, + 
"learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 38405196.0, + "reward": 0.359375, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998978972434998, + "sampling/importance_sampling_ratio/min": 8.706459084351081e-06, + "sampling/sampling_logp_difference/max": 11.651445388793945, + "sampling/sampling_logp_difference/mean": 0.021252838894724846, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.729486718384578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.729486718384578e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15299.0, + "completions/mean_length": 5838.71875, + "completions/mean_terminated_length": 5671.33349609375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 1.021155133843422, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001135052996687591, + "learning_rate": 1e-05, + "loss": 0.0178, + "num_tokens": 39171704.0, + "reward": 0.28125, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.003084881929680705, + "sampling/sampling_logp_difference/max": 5.7812418937683105, + "sampling/sampling_logp_difference/mean": 0.020781882107257843, + "step": 53 + }, + { + "clip_ratio/high_max": 1.7124169744420215e-05, + "clip_ratio/high_mean": 4.281042436105054e-06, + "clip_ratio/low_mean": 3.706903294187214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.135007543482061e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 14617.0, + "completions/max_terminated_length": 14617.0, + "completions/mean_length": 6358.5859375, + "completions/mean_terminated_length": 6358.5859375, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9720487147569656, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002638082252815366, + "learning_rate": 1e-05, + "loss": 0.0145, + "num_tokens": 40003859.0, + "reward": 0.40625, + "reward_std": 0.3174618184566498, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000380277633667, + "sampling/importance_sampling_ratio/min": 0.01960253342986107, + "sampling/sampling_logp_difference/max": 3.932096481323242, + "sampling/sampling_logp_difference/mean": 0.01991666667163372, + "step": 54 + }, + { + "clip_ratio/high_max": 6.55582925901399e-06, + "clip_ratio/high_mean": 2.994117721755174e-06, + "clip_ratio/low_mean": 2.222621503733535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5220332759090525e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14753.0, + "completions/max_terminated_length": 14753.0, + "completions/mean_length": 4634.1875, + "completions/mean_terminated_length": 4634.1875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9715309366583824, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001994960242882371, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 40616483.0, + "reward": 0.4375, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000698566436768, + 
"sampling/importance_sampling_ratio/min": 1.0510009815334342e-05, + "sampling/sampling_logp_difference/max": 11.46318244934082, + "sampling/sampling_logp_difference/mean": 0.01902047172188759, + "step": 55 + }, + { + "clip_ratio/high_max": 2.2474248908110894e-05, + "clip_ratio/high_mean": 7.571314540655294e-06, + "clip_ratio/low_mean": 4.3583780325207044e-05, + "clip_ratio/low_min": 4.6013396968191955e-06, + "clip_ratio/region_mean": 5.1155094070054474e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 6596.25, + "completions/mean_terminated_length": 6361.34423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.8207943215966225, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019902780186384916, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 41484443.0, + "reward": 0.4453125, + "reward_std": 0.326668381690979, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016689300537, + "sampling/importance_sampling_ratio/min": 7.485233072657138e-05, + "sampling/sampling_logp_difference/max": 9.499993324279785, + "sampling/sampling_logp_difference/mean": 0.018301833420991898, + "step": 56 + }, + { + "clip_ratio/high_max": 3.0019932637515012e-06, + "clip_ratio/high_mean": 7.504983159378753e-07, + "clip_ratio/low_mean": 4.332785601945943e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.407835376696312e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6785.75, + "completions/mean_terminated_length": 6313.70458984375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + 
"entropy": 0.9876058474183083, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015235114842653275, + "learning_rate": 1e-05, + "loss": 0.0128, + "num_tokens": 42372235.0, + "reward": 0.2421875, + "reward_std": 0.325075626373291, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999551773071289, + "sampling/importance_sampling_ratio/min": 0.026679370552301407, + "sampling/sampling_logp_difference/max": 3.6238646507263184, + "sampling/sampling_logp_difference/mean": 0.019945615902543068, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1349006601667497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1349006601667497e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 4881.2109375, + "completions/mean_terminated_length": 4510.1533203125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.989942155778408, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002033712575212121, + "learning_rate": 1e-05, + "loss": 0.1088, + "num_tokens": 43015238.0, + "reward": 0.4375, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000300407409668, + "sampling/importance_sampling_ratio/min": 0.0001238943514181301, + "sampling/sampling_logp_difference/max": 8.996081352233887, + "sampling/sampling_logp_difference/mean": 0.01887543685734272, + "step": 58 + }, + { + "clip_ratio/high_max": 2.584004687378183e-05, + "clip_ratio/high_mean": 6.4600117184454575e-06, + 
"clip_ratio/low_mean": 2.1371045761497953e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7831058105221018e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15001.0, + "completions/max_terminated_length": 15001.0, + "completions/mean_length": 4725.3984375, + "completions/mean_terminated_length": 4725.3984375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 1.0350637435913086, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030296226032078266, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 43637737.0, + "reward": 0.4453125, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999939203262329, + "sampling/importance_sampling_ratio/min": 0.00022932067804504186, + "sampling/sampling_logp_difference/max": 8.380389213562012, + "sampling/sampling_logp_difference/mean": 0.01995944231748581, + "step": 59 + }, + { + "clip_ratio/high_max": 1.994733975152485e-05, + "clip_ratio/high_mean": 4.986834937881213e-06, + "clip_ratio/low_mean": 3.5168303838872816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.015513832200668e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 4918.171875, + "completions/mean_terminated_length": 4736.1748046875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.965274304151535, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002758471528068185, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 44285327.0, + "reward": 0.328125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 
0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999663233757019, + "sampling/importance_sampling_ratio/min": 0.010958661325275898, + "sampling/sampling_logp_difference/max": 4.513625144958496, + "sampling/sampling_logp_difference/mean": 0.019083233550190926, + "step": 60 + }, + { + "clip_ratio/high_max": 1.0621563887980301e-05, + "clip_ratio/high_mean": 2.6553909719950752e-06, + "clip_ratio/low_mean": 3.838553107016196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1040922042157035e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15031.0, + "completions/mean_length": 4998.2890625, + "completions/mean_terminated_length": 4908.6376953125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9200445115566254, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027611786499619484, + "learning_rate": 1e-05, + "loss": 0.0575, + "num_tokens": 44944356.0, + "reward": 0.3515625, + "reward_std": 0.3895368278026581, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884366989136, + "sampling/importance_sampling_ratio/min": 0.0018651526188477874, + "sampling/sampling_logp_difference/max": 6.284412384033203, + "sampling/sampling_logp_difference/mean": 0.017853498458862305, + "step": 61 + }, + { + "clip_ratio/high_max": 1.0136624496226432e-05, + "clip_ratio/high_mean": 2.534156124056608e-06, + "clip_ratio/low_mean": 2.0260404085092887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2794560095462657e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6290.1796875, + 
"completions/mean_terminated_length": 6129.96044921875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.9360214695334435, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015557854203507304, + "learning_rate": 1e-05, + "loss": 0.0111, + "num_tokens": 45767867.0, + "reward": 0.34375, + "reward_std": 0.30168038606643677, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999427795410156, + "sampling/importance_sampling_ratio/min": 0.0011004531988874078, + "sampling/sampling_logp_difference/max": 6.812033176422119, + "sampling/sampling_logp_difference/mean": 0.0200855303555727, + "step": 62 + }, + { + "clip_ratio/high_max": 2.2559511307918e-06, + "clip_ratio/high_mean": 5.6398778269795e-07, + "clip_ratio/low_mean": 4.51761221711422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.574010984015331e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16366.0, + "completions/mean_length": 6486.15625, + "completions/mean_terminated_length": 6248.6083984375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.863138921558857, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026953541673719883, + "learning_rate": 1e-05, + "loss": -0.0194, + "num_tokens": 46618575.0, + "reward": 0.2578125, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999406337738037, + "sampling/importance_sampling_ratio/min": 0.0011708897072821856, + "sampling/sampling_logp_difference/max": 6.749991416931152, + 
"sampling/sampling_logp_difference/mean": 0.01863238587975502, + "step": 63 + }, + { + "clip_ratio/high_max": 1.0073357771034352e-05, + "clip_ratio/high_mean": 2.518339442758588e-06, + "clip_ratio/low_mean": 2.787370635815023e-05, + "clip_ratio/low_min": 3.837534222839167e-06, + "clip_ratio/region_mean": 3.0392045573535142e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 6442.7734375, + "completions/mean_terminated_length": 6284.9765625, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0242054909467697, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024442619178444147, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 47462274.0, + "reward": 0.328125, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998892545700073, + "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09, + "sampling/sampling_logp_difference/max": 19.124980926513672, + "sampling/sampling_logp_difference/mean": 0.019810764119029045, + "step": 64 + }, + { + "clip_ratio/high_max": 1.220810372615233e-05, + "clip_ratio/high_mean": 3.0520259315380827e-06, + "clip_ratio/low_mean": 4.339240456374682e-05, + "clip_ratio/low_min": 4.491233084991109e-06, + "clip_ratio/region_mean": 4.644443038159807e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 4807.765625, + "completions/mean_terminated_length": 4716.6142578125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 1.045751042664051, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.002512057079002261, + "learning_rate": 1e-05, + "loss": 0.003, + "num_tokens": 48096692.0, + "reward": 0.3671875, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999058842658997, + "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05, + "sampling/sampling_logp_difference/max": 11.374892234802246, + "sampling/sampling_logp_difference/mean": 0.01960371434688568, + "step": 65 + }, + { + "clip_ratio/high_max": 5.37941218681226e-06, + "clip_ratio/high_mean": 1.344853046703065e-06, + "clip_ratio/low_mean": 3.0161771633174794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1506624850408116e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 6703.8359375, + "completions/mean_terminated_length": 6471.51220703125, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 1.0592866837978363, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016389708034694195, + "learning_rate": 1e-05, + "loss": -0.024, + "num_tokens": 48974399.0, + "reward": 0.2734375, + "reward_std": 0.2585548758506775, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999353885650635, + "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06, + "sampling/sampling_logp_difference/max": 11.8125, + "sampling/sampling_logp_difference/mean": 0.020880095660686493, + "step": 66 + }, + { + "clip_ratio/high_max": 7.093600515872822e-06, + "clip_ratio/high_mean": 1.7734001289682055e-06, + "clip_ratio/low_mean": 4.470584758564655e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.647924811251869e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16295.0, + "completions/mean_length": 6140.5078125, + "completions/mean_terminated_length": 5724.10546875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 1.0998501181602478, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003946912474930286, + "learning_rate": 1e-05, + "loss": 0.0448, + "num_tokens": 49779920.0, + "reward": 0.34375, + "reward_std": 0.36796674132347107, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 2.849436668839189e-07, + "sampling/sampling_logp_difference/max": 15.070974349975586, + "sampling/sampling_logp_difference/mean": 0.021355850622057915, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.313956779038563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.313956779038563e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 6689.8046875, + "completions/mean_terminated_length": 6213.04052734375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.8561654165387154, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021656695753335953, + "learning_rate": 1e-05, + "loss": 0.0283, + "num_tokens": 50655023.0, + "reward": 0.203125, + "reward_std": 0.21723884344100952, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999941885471344, + "sampling/importance_sampling_ratio/min": 2.836359499269747e-06, + "sampling/sampling_logp_difference/max": 12.772989273071289, + "sampling/sampling_logp_difference/mean": 0.01873670145869255, + "step": 68 + }, + { + "clip_ratio/high_max": 2.3421607693308033e-05, + "clip_ratio/high_mean": 7.242933975248889e-06, + "clip_ratio/low_mean": 3.896083626386826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.620377103492501e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14330.0, + "completions/max_terminated_length": 14330.0, + "completions/mean_length": 5707.0078125, + "completions/mean_terminated_length": 5707.0078125, + "completions/min_length": 625.0, + "completions/min_terminated_length": 625.0, + "entropy": 1.1396166533231735, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004121148493140936, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 51406536.0, + "reward": 0.3125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999328851699829, + "sampling/importance_sampling_ratio/min": 0.0005196487763896585, + "sampling/sampling_logp_difference/max": 7.562357425689697, + "sampling/sampling_logp_difference/mean": 0.020000409334897995, + "step": 69 + }, + { + "clip_ratio/high_max": 1.82290532393381e-05, + "clip_ratio/high_mean": 4.557263309834525e-06, + "clip_ratio/low_mean": 2.5275351731579576e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9832615496161452e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5655.6328125, + "completions/mean_terminated_length": 5571.1572265625, + "completions/min_length": 157.0, + 
"completions/min_terminated_length": 157.0, + "entropy": 0.8928132206201553, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032538517843931913, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 52148473.0, + "reward": 0.3984375, + "reward_std": 0.29432642459869385, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000033378601074, + "sampling/importance_sampling_ratio/min": 0.0017573959194123745, + "sampling/sampling_logp_difference/max": 6.343922138214111, + "sampling/sampling_logp_difference/mean": 0.018881790339946747, + "step": 70 + }, + { + "clip_ratio/high_max": 1.2836022506235167e-05, + "clip_ratio/high_mean": 3.209005626558792e-06, + "clip_ratio/low_mean": 3.8109637216621195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.131864307055366e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7399.7890625, + "completions/mean_terminated_length": 7034.5771484375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.8808257132768631, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002061733277514577, + "learning_rate": 1e-05, + "loss": 0.0191, + "num_tokens": 53113230.0, + "reward": 0.3046875, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673962593079, + "sampling/importance_sampling_ratio/min": 0.005283349193632603, + "sampling/sampling_logp_difference/max": 5.243195056915283, + "sampling/sampling_logp_difference/mean": 0.018456293269991875, + "step": 71 + }, + { + "clip_ratio/high_max": 
1.5806871488166507e-05, + "clip_ratio/high_mean": 4.739466817227367e-06, + "clip_ratio/low_mean": 3.610486896832299e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.084433521711617e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16208.0, + "completions/mean_length": 5730.9609375, + "completions/mean_terminated_length": 5475.2880859375, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9486126750707626, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012298432411625981, + "learning_rate": 1e-05, + "loss": 0.0208, + "num_tokens": 53864049.0, + "reward": 0.359375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999348521232605, + "sampling/importance_sampling_ratio/min": 4.832820559386164e-05, + "sampling/sampling_logp_difference/max": 9.937495231628418, + "sampling/sampling_logp_difference/mean": 0.01919996738433838, + "step": 72 + }, + { + "clip_ratio/high_max": 1.2390134997986024e-05, + "clip_ratio/high_mean": 3.097533749496506e-06, + "clip_ratio/low_mean": 3.8867822581778455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.19653564449618e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13500.0, + "completions/mean_length": 4620.5703125, + "completions/mean_terminated_length": 4527.94482421875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9557560831308365, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002882040338590741, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 54473498.0, + "reward": 0.3984375, + "reward_std": 
0.39294686913490295, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915195465088, + "sampling/importance_sampling_ratio/min": 1.577107298089686e-07, + "sampling/sampling_logp_difference/max": 15.662503242492676, + "sampling/sampling_logp_difference/mean": 0.018525000661611557, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.088819471486204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.088819471486204e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16314.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 5074.0703125, + "completions/mean_terminated_length": 5074.0703125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8830869868397713, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003324020653963089, + "learning_rate": 1e-05, + "loss": 0.0305, + "num_tokens": 55141787.0, + "reward": 0.4609375, + "reward_std": 0.30115634202957153, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999203681945801, + "sampling/importance_sampling_ratio/min": 0.0009876838885247707, + "sampling/sampling_logp_difference/max": 6.920147895812988, + "sampling/sampling_logp_difference/mean": 0.018072880804538727, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.526649884908693e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.526649884908693e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15251.0, + "completions/max_terminated_length": 15251.0, + "completions/mean_length": 6192.1015625, + 
"completions/mean_terminated_length": 6192.1015625, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "entropy": 1.0888547226786613, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017452294705435634, + "learning_rate": 1e-05, + "loss": 0.0216, + "num_tokens": 55954144.0, + "reward": 0.2890625, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473690986633, + "sampling/importance_sampling_ratio/min": 5.061922365712235e-07, + "sampling/sampling_logp_difference/max": 14.496349334716797, + "sampling/sampling_logp_difference/mean": 0.021221645176410675, + "step": 75 + }, + { + "clip_ratio/high_max": 1.6768677141953958e-05, + "clip_ratio/high_mean": 5.080836899651331e-06, + "clip_ratio/low_mean": 3.340929970363504e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.84901372854074e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15740.0, + "completions/mean_length": 6204.296875, + "completions/mean_terminated_length": 6124.1416015625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 1.0423575639724731, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0033357341308146715, + "learning_rate": 1e-05, + "loss": 0.1073, + "num_tokens": 56765470.0, + "reward": 0.3359375, + "reward_std": 0.37875816226005554, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99998539686203, + "sampling/importance_sampling_ratio/min": 4.564182381727733e-05, + "sampling/sampling_logp_difference/max": 9.994686126708984, + 
"sampling/sampling_logp_difference/mean": 0.01908688060939312, + "step": 76 + }, + { + "clip_ratio/high_max": 3.149884150843718e-06, + "clip_ratio/high_mean": 7.874710377109295e-07, + "clip_ratio/low_mean": 2.430614893000893e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.509361991087644e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14409.0, + "completions/max_terminated_length": 14409.0, + "completions/mean_length": 5070.3125, + "completions/mean_terminated_length": 5070.3125, + "completions/min_length": 629.0, + "completions/min_terminated_length": 629.0, + "entropy": 1.0737399458885193, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038695367984473705, + "learning_rate": 1e-05, + "loss": 0.0015, + "num_tokens": 57432958.0, + "reward": 0.390625, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999223947525024, + "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06, + "sampling/sampling_logp_difference/max": 13.376652717590332, + "sampling/sampling_logp_difference/mean": 0.01970684342086315, + "step": 77 + }, + { + "clip_ratio/high_max": 1.9821940441033803e-05, + "clip_ratio/high_mean": 4.955485110258451e-06, + "clip_ratio/low_mean": 2.9055729555693688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.401121466595214e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15799.0, + "completions/mean_length": 5750.21875, + "completions/mean_terminated_length": 5495.00830078125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.9708107560873032, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002927646040916443, + "learning_rate": 1e-05, + 
"loss": 0.0166, + "num_tokens": 58187426.0, + "reward": 0.296875, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999390840530396, + "sampling/importance_sampling_ratio/min": 0.015204614959657192, + "sampling/sampling_logp_difference/max": 4.186156272888184, + "sampling/sampling_logp_difference/mean": 0.019483914598822594, + "step": 78 + }, + { + "clip_ratio/high_max": 2.3815636723156786e-05, + "clip_ratio/high_mean": 5.953909180789196e-06, + "clip_ratio/low_mean": 4.989707144886779e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.585097960647545e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15938.0, + "completions/mean_length": 6067.484375, + "completions/mean_terminated_length": 5986.251953125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9576351121068001, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0026169484481215477, + "learning_rate": 1e-05, + "loss": -0.0055, + "num_tokens": 58983336.0, + "reward": 0.390625, + "reward_std": 0.3406373858451843, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620914459229, + "sampling/importance_sampling_ratio/min": 1.974713995878119e-06, + "sampling/sampling_logp_difference/max": 13.135087013244629, + "sampling/sampling_logp_difference/mean": 0.019007554277777672, + "step": 79 + }, + { + "clip_ratio/high_max": 2.4238934656750644e-05, + "clip_ratio/high_mean": 7.786730066072778e-06, + "clip_ratio/low_mean": 4.5700241571466904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3486972547034384e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 13640.0, + "completions/max_terminated_length": 13640.0, + "completions/mean_length": 4612.8984375, + "completions/mean_terminated_length": 4612.8984375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.9636320173740387, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015429699560627341, + "learning_rate": 1e-05, + "loss": -0.018, + "num_tokens": 59590763.0, + "reward": 0.421875, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08, + "sampling/sampling_logp_difference/max": 17.468652725219727, + "sampling/sampling_logp_difference/mean": 0.019313856959342957, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0911465842109465e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0911465842109465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 6101.3125, + "completions/mean_terminated_length": 5854.5283203125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.8831139355897903, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022505265660583973, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 60391283.0, + "reward": 0.3125, + "reward_std": 0.29302334785461426, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + 
"sampling/importance_sampling_ratio/min": 0.0003816343960352242, + "sampling/sampling_logp_difference/max": 7.871047496795654, + "sampling/sampling_logp_difference/mean": 0.018377842381596565, + "step": 81 + }, + { + "clip_ratio/high_max": 1.547606643725885e-05, + "clip_ratio/high_mean": 3.869016609314713e-06, + "clip_ratio/low_mean": 2.478705800967873e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8656074391619768e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14862.0, + "completions/mean_length": 4705.9921875, + "completions/mean_terminated_length": 4614.03955078125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.9557913094758987, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002069958718493581, + "learning_rate": 1e-05, + "loss": -0.0015, + "num_tokens": 61021490.0, + "reward": 0.4296875, + "reward_std": 0.2637920379638672, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030232429504, + "sampling/importance_sampling_ratio/min": 2.76673017651774e-05, + "sampling/sampling_logp_difference/max": 10.495259284973145, + "sampling/sampling_logp_difference/mean": 0.018629569560289383, + "step": 82 + }, + { + "clip_ratio/high_max": 2.0910484636260662e-05, + "clip_ratio/high_mean": 5.2276211590651656e-06, + "clip_ratio/low_mean": 1.952954164607945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4757162805144617e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13745.0, + "completions/max_terminated_length": 13745.0, + "completions/mean_length": 5116.78125, + "completions/mean_terminated_length": 5116.78125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 1.0198405236005783, 
+ "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034461067989468575, + "learning_rate": 1e-05, + "loss": -0.0073, + "num_tokens": 61695382.0, + "reward": 0.265625, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999936819076538, + "sampling/importance_sampling_ratio/min": 0.012227212078869343, + "sampling/sampling_logp_difference/max": 4.4040913581848145, + "sampling/sampling_logp_difference/mean": 0.019400250166654587, + "step": 83 + }, + { + "clip_ratio/high_max": 1.5340228401328204e-05, + "clip_ratio/high_mean": 3.835057100332051e-06, + "clip_ratio/low_mean": 3.150914017169271e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.534419727202476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 5891.9140625, + "completions/mean_terminated_length": 5553.45947265625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.9568078517913818, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025854657869786024, + "learning_rate": 1e-05, + "loss": 0.1013, + "num_tokens": 62474883.0, + "reward": 0.3203125, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001013278961182, + "sampling/importance_sampling_ratio/min": 0.0015072470996528864, + "sampling/sampling_logp_difference/max": 6.497470378875732, + "sampling/sampling_logp_difference/mean": 0.019574139267206192, + "step": 84 + }, + { + "clip_ratio/high_max": 1.108303422370227e-05, + "clip_ratio/high_mean": 2.7707585559255676e-06, + 
"clip_ratio/low_mean": 2.2325777763398946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5096536319324514e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13671.0, + "completions/mean_length": 5300.3359375, + "completions/mean_terminated_length": 5213.06298828125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.9722280204296112, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025075653102248907, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 63172454.0, + "reward": 0.203125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 0.00020346972451079637, + "sampling/sampling_logp_difference/max": 8.499993324279785, + "sampling/sampling_logp_difference/mean": 0.02002432942390442, + "step": 85 + }, + { + "clip_ratio/high_max": 1.3991947980684927e-05, + "clip_ratio/high_mean": 3.4979869951712317e-06, + "clip_ratio/low_mean": 4.893367201930232e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.243165958290774e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15617.0, + "completions/mean_length": 6364.21875, + "completions/mean_terminated_length": 6205.1748046875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 1.0607495978474617, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017982006538659334, + "learning_rate": 1e-05, + "loss": -0.0117, + "num_tokens": 64007602.0, + "reward": 0.2890625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.2890625, + 
"rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 3.823801307589747e-05, + "sampling/sampling_logp_difference/max": 10.171680450439453, + "sampling/sampling_logp_difference/mean": 0.020373597741127014, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6416430046083406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6416430046083406e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14709.0, + "completions/mean_length": 5746.3125, + "completions/mean_terminated_length": 5403.1611328125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "entropy": 0.9913106113672256, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002207317156717181, + "learning_rate": 1e-05, + "loss": 0.063, + "num_tokens": 64762058.0, + "reward": 0.34375, + "reward_std": 0.3264310359954834, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999239444732666, + "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08, + "sampling/sampling_logp_difference/max": 16.744617462158203, + "sampling/sampling_logp_difference/mean": 0.020608089864253998, + "step": 87 + }, + { + "clip_ratio/high_max": 1.2681661701208213e-05, + "clip_ratio/high_mean": 3.1704154253020533e-06, + "clip_ratio/low_mean": 3.541917828897567e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.85895939416514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6088.5625, + 
"completions/mean_terminated_length": 5841.47216796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.9040444120764732, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012974507408216596, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 65561002.0, + "reward": 0.3671875, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998487234115601, + "sampling/importance_sampling_ratio/min": 6.021501121722395e-06, + "sampling/sampling_logp_difference/max": 12.020174026489258, + "sampling/sampling_logp_difference/mean": 0.01939838007092476, + "step": 88 + }, + { + "clip_ratio/high_max": 7.807132533343975e-06, + "clip_ratio/high_mean": 1.9517831333359936e-06, + "clip_ratio/low_mean": 1.8564539345788944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.05163223654381e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15021.0, + "completions/mean_length": 5765.5, + "completions/mean_terminated_length": 5510.65625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.9966336265206337, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0013380619930103421, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 66318482.0, + "reward": 0.375, + "reward_std": 0.13994136452674866, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999471306800842, + "sampling/importance_sampling_ratio/min": 7.288413598871557e-06, + "sampling/sampling_logp_difference/max": 11.829224586486816, + 
"sampling/sampling_logp_difference/mean": 0.018109245225787163, + "step": 89 + }, + { + "clip_ratio/high_max": 1.7906912489706883e-05, + "clip_ratio/high_mean": 4.476728122426721e-06, + "clip_ratio/low_mean": 2.5812531305291486e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0289259655091882e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16120.0, + "completions/mean_length": 5462.78125, + "completions/mean_terminated_length": 5200.67236328125, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "entropy": 0.9345141425728798, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023930128663778305, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 67038582.0, + "reward": 0.46875, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513030052185, + "sampling/importance_sampling_ratio/min": 0.008508839644491673, + "sampling/sampling_logp_difference/max": 4.7666497230529785, + "sampling/sampling_logp_difference/mean": 0.019220296293497086, + "step": 90 + }, + { + "clip_ratio/high_max": 1.551389118503721e-05, + "clip_ratio/high_mean": 3.878472796259302e-06, + "clip_ratio/low_mean": 3.239646628117043e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6274939645863924e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15034.0, + "completions/max_terminated_length": 15034.0, + "completions/mean_length": 5547.5078125, + "completions/mean_terminated_length": 5547.5078125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0511749312281609, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0013633714988827705, + "learning_rate": 1e-05, 
+ "loss": 0.0462, + "num_tokens": 67774487.0, + "reward": 0.203125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05, + "sampling/sampling_logp_difference/max": 11.418023109436035, + "sampling/sampling_logp_difference/mean": 0.020328814163804054, + "step": 91 + }, + { + "clip_ratio/high_max": 1.5384989410449634e-05, + "clip_ratio/high_mean": 3.846247352612409e-06, + "clip_ratio/low_mean": 3.441604167164769e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.826228908110352e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5835.4140625, + "completions/mean_terminated_length": 5406.609375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 1.0024723336100578, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0036165034398436546, + "learning_rate": 1e-05, + "loss": 0.0373, + "num_tokens": 68541660.0, + "reward": 0.34375, + "reward_std": 0.3584783673286438, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 9.518130354990717e-06, + "sampling/sampling_logp_difference/max": 11.562312126159668, + "sampling/sampling_logp_difference/mean": 0.020469525828957558, + "step": 92 + }, + { + "clip_ratio/high_max": 6.105602551542688e-06, + "clip_ratio/high_mean": 1.526400637885672e-06, + "clip_ratio/low_mean": 5.3129634352444555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.46560352177039e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15695.0, + "completions/mean_length": 6252.609375, + "completions/mean_terminated_length": 6172.83447265625, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "entropy": 1.0325519517064095, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022011541295796633, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 69365418.0, + "reward": 0.3828125, + "reward_std": 0.32301604747772217, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998809099197388, + "sampling/importance_sampling_ratio/min": 0.0005531083443202078, + "sampling/sampling_logp_difference/max": 7.4999566078186035, + "sampling/sampling_logp_difference/mean": 0.02079072594642639, + "step": 93 + }, + { + "clip_ratio/high_max": 4.348128641140647e-06, + "clip_ratio/high_mean": 1.0870321602851618e-06, + "clip_ratio/low_mean": 3.0097819148977578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.118485085451539e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15316.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 5581.484375, + "completions/mean_terminated_length": 5581.484375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.9222500994801521, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002300912281498313, + "learning_rate": 1e-05, + "loss": -0.0007, + "num_tokens": 70099320.0, + "reward": 0.296875, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9998577833175659, + "sampling/importance_sampling_ratio/min": 8.140386853483506e-08, + "sampling/sampling_logp_difference/max": 16.323843002319336, + "sampling/sampling_logp_difference/mean": 0.01952272653579712, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5122252029395895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5122252029395895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15781.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5424.140625, + "completions/mean_terminated_length": 5424.140625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 1.0446564108133316, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016312639927491546, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 70811474.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000094175338745, + "sampling/importance_sampling_ratio/min": 0.0021919538266956806, + "sampling/sampling_logp_difference/max": 6.12296199798584, + "sampling/sampling_logp_difference/mean": 0.019741754978895187, + "step": 95 + }, + { + "clip_ratio/high_max": 1.0354576261306647e-05, + "clip_ratio/high_mean": 3.496124691082514e-06, + "clip_ratio/low_mean": 4.096481598026003e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.446094089871622e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 5884.9609375, + "completions/mean_terminated_length": 5884.9609375, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9605691060423851, + "epoch": 
0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032865386456251144, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 71582701.0, + "reward": 0.4140625, + "reward_std": 0.3514111638069153, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999833106994629, + "sampling/importance_sampling_ratio/min": 1.149311810877407e-05, + "sampling/sampling_logp_difference/max": 11.373762130737305, + "sampling/sampling_logp_difference/mean": 0.019438734278082848, + "step": 96 + }, + { + "clip_ratio/high_max": 1.026998006636859e-05, + "clip_ratio/high_mean": 2.5674950165921473e-06, + "clip_ratio/low_mean": 3.5440503552308655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8007998455213965e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15361.0, + "completions/max_terminated_length": 15361.0, + "completions/mean_length": 4835.09375, + "completions/mean_terminated_length": 4835.09375, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.9038172215223312, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004721678793430328, + "learning_rate": 1e-05, + "loss": 0.1143, + "num_tokens": 72220025.0, + "reward": 0.4765625, + "reward_std": 0.38481879234313965, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99994957447052, + "sampling/importance_sampling_ratio/min": 2.710051205667696e-07, + "sampling/sampling_logp_difference/max": 15.12112808227539, + "sampling/sampling_logp_difference/mean": 0.017888439819216728, + "step": 97 + }, + { + "clip_ratio/high_max": 2.93432283342554e-05, + "clip_ratio/high_mean": 9.56252398509605e-06, + "clip_ratio/low_mean": 
4.7865792453194445e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.742831808674964e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14431.0, + "completions/mean_length": 5979.078125, + "completions/mean_terminated_length": 5897.1494140625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 1.0227951630949974, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0010532280430197716, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 73005515.0, + "reward": 0.2890625, + "reward_std": 0.30115631222724915, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999090433120728, + "sampling/importance_sampling_ratio/min": 0.00030157779110595584, + "sampling/sampling_logp_difference/max": 8.10648250579834, + "sampling/sampling_logp_difference/mean": 0.019633149728178978, + "step": 98 + }, + { + "clip_ratio/high_max": 4.203234766464448e-06, + "clip_ratio/high_mean": 1.050808691616112e-06, + "clip_ratio/low_mean": 2.5574990331733716e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6625799137036665e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15886.0, + "completions/max_terminated_length": 15886.0, + "completions/mean_length": 4292.1796875, + "completions/mean_terminated_length": 4292.1796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.8719984591007233, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038324075285345316, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 73572794.0, + "reward": 0.4375, + "reward_std": 0.2972046136856079, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999188780784607, + "sampling/importance_sampling_ratio/min": 0.015675775706768036, + "sampling/sampling_logp_difference/max": 4.155638694763184, + "sampling/sampling_logp_difference/mean": 0.018074234947562218, + "step": 99 + }, + { + "clip_ratio/high_max": 4.431366960488958e-06, + "clip_ratio/high_mean": 1.1078417401222396e-06, + "clip_ratio/low_mean": 4.433405501913512e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.54418968729442e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14674.0, + "completions/max_terminated_length": 14674.0, + "completions/mean_length": 5449.2890625, + "completions/mean_terminated_length": 5449.2890625, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9137986451387405, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004843447357416153, + "learning_rate": 1e-05, + "loss": 0.0166, + "num_tokens": 74289607.0, + "reward": 0.5, + "reward_std": 0.40609243512153625, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 8.851584993863071e-07, + "sampling/sampling_logp_difference/max": 13.937499046325684, + "sampling/sampling_logp_difference/mean": 0.018183842301368713, + "step": 100 + }, + { + "clip_ratio/high_max": 8.212076863856055e-06, + "clip_ratio/high_mean": 2.0530192159640137e-06, + "clip_ratio/low_mean": 3.6279372466196946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.833239122741361e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16163.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 4983.3515625, + "completions/mean_terminated_length": 4983.3515625, + 
"completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "entropy": 0.9354705810546875, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037651765160262585, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 74946484.0, + "reward": 0.3671875, + "reward_std": 0.3090519309043884, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 0.00011593531962716952, + "sampling/sampling_logp_difference/max": 9.062478065490723, + "sampling/sampling_logp_difference/mean": 0.018207306042313576, + "step": 101 + }, + { + "clip_ratio/high_max": 1.3182888324081432e-05, + "clip_ratio/high_mean": 3.295722081020358e-06, + "clip_ratio/low_mean": 2.544108633628639e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8736808644680423e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16039.0, + "completions/mean_length": 6351.1015625, + "completions/mean_terminated_length": 6027.45947265625, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "entropy": 0.9310042560100555, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0009160125628113747, + "learning_rate": 1e-05, + "loss": -0.023, + "num_tokens": 75779145.0, + "reward": 0.3828125, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998877048492432, + "sampling/importance_sampling_ratio/min": 0.0002961359277833253, + "sampling/sampling_logp_difference/max": 8.1246919631958, + "sampling/sampling_logp_difference/mean": 0.018513178452849388, + "step": 102 
+ }, + { + "clip_ratio/high_max": 1.1402620202716207e-05, + "clip_ratio/high_mean": 3.935649147024378e-06, + "clip_ratio/low_mean": 3.059757568735222e-05, + "clip_ratio/low_min": 4.3258582991256844e-06, + "clip_ratio/region_mean": 3.45332257438713e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14471.0, + "completions/mean_length": 5293.40625, + "completions/mean_terminated_length": 4935.64501953125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 1.0732879787683487, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023993055801838636, + "learning_rate": 1e-05, + "loss": 0.1021, + "num_tokens": 76475557.0, + "reward": 0.34375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000077724456787, + "sampling/importance_sampling_ratio/min": 6.613240111619234e-05, + "sampling/sampling_logp_difference/max": 9.623851776123047, + "sampling/sampling_logp_difference/mean": 0.020792219787836075, + "step": 103 + }, + { + "clip_ratio/high_max": 2.130644793396641e-05, + "clip_ratio/high_mean": 8.929533635182452e-06, + "clip_ratio/low_mean": 2.663600798769039e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.556554071337814e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 7619.7578125, + "completions/mean_terminated_length": 7409.41650390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9646238535642624, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014872358879074454, + "learning_rate": 1e-05, + "loss": 0.0439, + "num_tokens": 77474310.0, + 
"reward": 0.34375, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999638795852661, + "sampling/importance_sampling_ratio/min": 0.0016686831368133426, + "sampling/sampling_logp_difference/max": 6.395720481872559, + "sampling/sampling_logp_difference/mean": 0.020074717700481415, + "step": 104 + }, + { + "clip_ratio/high_max": 1.7765815300663235e-05, + "clip_ratio/high_mean": 5.154013138053415e-06, + "clip_ratio/low_mean": 5.166909659237717e-05, + "clip_ratio/low_min": 8.365680514543783e-06, + "clip_ratio/region_mean": 5.68231100714911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15984.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 5959.921875, + "completions/mean_terminated_length": 5959.921875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.004471093416214, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00398358516395092, + "learning_rate": 1e-05, + "loss": 0.1016, + "num_tokens": 78257132.0, + "reward": 0.359375, + "reward_std": 0.3653082847595215, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000170469284058, + "sampling/importance_sampling_ratio/min": 0.0030075267422944307, + "sampling/sampling_logp_difference/max": 5.806637287139893, + "sampling/sampling_logp_difference/mean": 0.020755283534526825, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6946955838648137e-05, + "clip_ratio/high_mean": 4.236738959662034e-06, + "clip_ratio/low_mean": 4.510891039899434e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.934564867653535e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 13736.0, + "completions/mean_length": 5427.03125, + "completions/mean_terminated_length": 5340.755859375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9117375314235687, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019883522763848305, + "learning_rate": 1e-05, + "loss": 0.01, + "num_tokens": 78971072.0, + "reward": 0.375, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000550746917725, + "sampling/importance_sampling_ratio/min": 0.0008046010043472052, + "sampling/sampling_logp_difference/max": 7.125164031982422, + "sampling/sampling_logp_difference/mean": 0.018812140449881554, + "step": 106 + }, + { + "clip_ratio/high_max": 2.968176841022796e-05, + "clip_ratio/high_mean": 7.42044210255699e-06, + "clip_ratio/low_mean": 3.220799408154562e-05, + "clip_ratio/low_min": 5.315981979947537e-06, + "clip_ratio/region_mean": 3.962843629778945e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16293.0, + "completions/max_terminated_length": 16293.0, + "completions/mean_length": 6062.078125, + "completions/mean_terminated_length": 6062.078125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 1.0164100378751755, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00450351694598794, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 79764434.0, + "reward": 0.2578125, + "reward_std": 0.26355957984924316, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999713897705078, + 
"sampling/importance_sampling_ratio/min": 0.0007411236292682588, + "sampling/sampling_logp_difference/max": 7.207343101501465, + "sampling/sampling_logp_difference/mean": 0.020526543259620667, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.856050622947805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.856050622947805e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13689.0, + "completions/max_terminated_length": 13689.0, + "completions/mean_length": 4856.53125, + "completions/mean_terminated_length": 4856.53125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 1.0780886858701706, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0033157530706375837, + "learning_rate": 1e-05, + "loss": 0.046, + "num_tokens": 80405238.0, + "reward": 0.3359375, + "reward_std": 0.3487703502178192, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999889135360718, + "sampling/importance_sampling_ratio/min": 0.033773623406887054, + "sampling/sampling_logp_difference/max": 3.7256407737731934, + "sampling/sampling_logp_difference/mean": 0.019188418984413147, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.975351790406421e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.975351790406421e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16335.0, + "completions/max_terminated_length": 16335.0, + "completions/mean_length": 3930.5859375, + "completions/mean_terminated_length": 3930.5859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8666863515973091, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.005471619311720133, + "learning_rate": 1e-05, + "loss": -0.0779, + "num_tokens": 80926721.0, + "reward": 0.5859375, + "reward_std": 0.3164186179637909, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000040531158447, + "sampling/importance_sampling_ratio/min": 0.0002562212466727942, + "sampling/sampling_logp_difference/max": 8.269469261169434, + "sampling/sampling_logp_difference/mean": 0.017708823084831238, + "step": 109 + }, + { + "clip_ratio/high_max": 6.743997801095247e-06, + "clip_ratio/high_mean": 1.6859994502738118e-06, + "clip_ratio/low_mean": 3.61007656692891e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7786765119562915e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15546.0, + "completions/mean_length": 5934.9453125, + "completions/mean_terminated_length": 5684.16845703125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.9991667941212654, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002580739092081785, + "learning_rate": 1e-05, + "loss": -0.0065, + "num_tokens": 81707978.0, + "reward": 0.3046875, + "reward_std": 0.24671243131160736, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000852346420288, + "sampling/importance_sampling_ratio/min": 0.002478762762621045, + "sampling/sampling_logp_difference/max": 5.999995708465576, + "sampling/sampling_logp_difference/mean": 0.019801246002316475, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.43532002741631e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.43532002741631e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 5866.84375, + "completions/mean_terminated_length": 5699.9052734375, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.9848997294902802, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0010949905263260007, + "learning_rate": 1e-05, + "loss": 0.0266, + "num_tokens": 82477310.0, + "reward": 0.2734375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999667406082153, + "sampling/importance_sampling_ratio/min": 9.04304688447155e-05, + "sampling/sampling_logp_difference/max": 9.310929298400879, + "sampling/sampling_logp_difference/mean": 0.020769795402884483, + "step": 111 + }, + { + "clip_ratio/high_max": 1.9307613456476247e-05, + "clip_ratio/high_mean": 4.826903364119062e-06, + "clip_ratio/low_mean": 5.842190330440644e-05, + "clip_ratio/low_min": 1.2287753634154797e-05, + "clip_ratio/region_mean": 6.324880496322294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14501.0, + "completions/max_terminated_length": 14501.0, + "completions/mean_length": 6613.7578125, + "completions/mean_terminated_length": 6613.7578125, + "completions/min_length": 1033.0, + "completions/min_terminated_length": 1033.0, + "entropy": 0.9176012054085732, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020384234376251698, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 83345055.0, + "reward": 0.3671875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999457001686096, + "sampling/importance_sampling_ratio/min": 0.029541675001382828, + "sampling/sampling_logp_difference/max": 3.5219533443450928, + "sampling/sampling_logp_difference/mean": 0.018883168697357178, + "step": 112 + }, + { + "clip_ratio/high_max": 1.382043183184578e-05, + "clip_ratio/high_mean": 3.455107957961445e-06, + "clip_ratio/low_mean": 5.789885449303256e-05, + "clip_ratio/low_min": 1.017130716718384e-05, + "clip_ratio/region_mean": 6.135396188255982e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 6392.3125, + "completions/mean_terminated_length": 6070.0, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "entropy": 0.904954232275486, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0031166900880634785, + "learning_rate": 1e-05, + "loss": 0.0351, + "num_tokens": 84186343.0, + "reward": 0.390625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999208450317383, + "sampling/importance_sampling_ratio/min": 0.00022529886336997151, + "sampling/sampling_logp_difference/max": 8.398082733154297, + "sampling/sampling_logp_difference/mean": 0.01931958645582199, + "step": 113 + }, + { + "clip_ratio/high_max": 1.7221671441802755e-05, + "clip_ratio/high_mean": 6.549099907715572e-06, + "clip_ratio/low_mean": 3.147818074467068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802728065238625e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5982.703125, + "completions/mean_terminated_length": 5817.603515625, + "completions/min_length": 294.0, + 
"completions/min_terminated_length": 294.0, + "entropy": 0.8394555225968361, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022041688207536936, + "learning_rate": 1e-05, + "loss": 0.1043, + "num_tokens": 84971129.0, + "reward": 0.3125, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030828475952, + "sampling/importance_sampling_ratio/min": 1.553593506287143e-06, + "sampling/sampling_logp_difference/max": 13.374939918518066, + "sampling/sampling_logp_difference/mean": 0.01795877143740654, + "step": 114 + }, + { + "clip_ratio/high_max": 2.9651660042873118e-05, + "clip_ratio/high_mean": 9.398806923854863e-06, + "clip_ratio/low_mean": 4.788733849636628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.728614519284747e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14988.0, + "completions/mean_length": 4976.921875, + "completions/mean_terminated_length": 4608.95166015625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.8381234556436539, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0037972736172378063, + "learning_rate": 1e-05, + "loss": 0.1244, + "num_tokens": 85625559.0, + "reward": 0.4765625, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970555305481, + "sampling/importance_sampling_ratio/min": 0.002990707289427519, + "sampling/sampling_logp_difference/max": 5.8122453689575195, + "sampling/sampling_logp_difference/mean": 0.01815030723810196, + "step": 115 + }, + { + "clip_ratio/high_max": 
4.130592969886493e-06, + "clip_ratio/high_mean": 1.0326482424716232e-06, + "clip_ratio/low_mean": 1.6904315600640984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7936963843112608e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 6307.2421875, + "completions/mean_terminated_length": 6065.400390625, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 1.1176434755325317, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0012413962977007031, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 86453606.0, + "reward": 0.28125, + "reward_std": 0.2280253767967224, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 0.004730688873678446, + "sampling/sampling_logp_difference/max": 5.353684425354004, + "sampling/sampling_logp_difference/mean": 0.021790307015180588, + "step": 116 + }, + { + "clip_ratio/high_max": 1.3160772823539446e-05, + "clip_ratio/high_mean": 3.2901932058848615e-06, + "clip_ratio/low_mean": 3.582628983167524e-05, + "clip_ratio/low_min": 2.61966624748311e-06, + "clip_ratio/region_mean": 3.911648195753514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 7263.1640625, + "completions/mean_terminated_length": 7044.26416015625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.107876107096672, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017762042116373777, + "learning_rate": 1e-05, + "loss": 0.0349, + "num_tokens": 87402763.0, + "reward": 0.2578125, + "reward_std": 
0.27776598930358887, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741315841675, + "sampling/importance_sampling_ratio/min": 0.0009408573969267309, + "sampling/sampling_logp_difference/max": 6.968719005584717, + "sampling/sampling_logp_difference/mean": 0.02103034406900406, + "step": 117 + }, + { + "clip_ratio/high_max": 3.987745776612428e-05, + "clip_ratio/high_mean": 1.1877163728968299e-05, + "clip_ratio/low_mean": 4.26799579145154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.455712096136267e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15416.0, + "completions/mean_length": 5093.859375, + "completions/mean_terminated_length": 4914.65087890625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 1.1065888702869415, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032127038575708866, + "learning_rate": 1e-05, + "loss": 0.0194, + "num_tokens": 88077385.0, + "reward": 0.421875, + "reward_std": 0.345874547958374, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 7.033879228401929e-05, + "sampling/sampling_logp_difference/max": 9.562187194824219, + "sampling/sampling_logp_difference/mean": 0.020314980298280716, + "step": 118 + }, + { + "clip_ratio/high_max": 9.35208754526684e-06, + "clip_ratio/high_mean": 4.4788730519940145e-06, + "clip_ratio/low_mean": 3.470697703278347e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.918584917528278e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15740.0, + "completions/mean_length": 6943.53125, + "completions/mean_terminated_length": 6639.0, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.9009081721305847, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028925195802003145, + "learning_rate": 1e-05, + "loss": 0.0862, + "num_tokens": 88985269.0, + "reward": 0.3984375, + "reward_std": 0.3535328209400177, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 6.553035092338177e-08, + "sampling/sampling_logp_difference/max": 16.540752410888672, + "sampling/sampling_logp_difference/mean": 0.019378282129764557, + "step": 119 + }, + { + "clip_ratio/high_max": 1.0939961612166371e-05, + "clip_ratio/high_mean": 2.734990403041593e-06, + "clip_ratio/low_mean": 2.4615862798782473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7350853201824066e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15148.0, + "completions/max_terminated_length": 15148.0, + "completions/mean_length": 4976.25, + "completions/mean_terminated_length": 4976.25, + "completions/min_length": 702.0, + "completions/min_terminated_length": 702.0, + "entropy": 0.9463540017604828, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017386430408805609, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 89645205.0, + "reward": 0.359375, + "reward_std": 0.26462042331695557, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999554753303528, + "sampling/importance_sampling_ratio/min": 7.889595508459024e-06, + 
"sampling/sampling_logp_difference/max": 11.74996566772461, + "sampling/sampling_logp_difference/mean": 0.018035830929875374, + "step": 120 + }, + { + "clip_ratio/high_max": 5.941629297012696e-06, + "clip_ratio/high_mean": 1.485407324253174e-06, + "clip_ratio/low_mean": 2.6826061798601586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8311469009167922e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 6439.5390625, + "completions/mean_terminated_length": 6281.69091796875, + "completions/min_length": 959.0, + "completions/min_terminated_length": 959.0, + "entropy": 0.899876207113266, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037381781730800867, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 90489394.0, + "reward": 0.3203125, + "reward_std": 0.2624938488006592, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999206066131592, + "sampling/importance_sampling_ratio/min": 0.003606764366850257, + "sampling/sampling_logp_difference/max": 5.62494421005249, + "sampling/sampling_logp_difference/mean": 0.019368179142475128, + "step": 121 + }, + { + "clip_ratio/high_max": 5.189952389628161e-06, + "clip_ratio/high_mean": 1.2974880974070402e-06, + "clip_ratio/low_mean": 3.058137212974543e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.187886022715247e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15979.0, + "completions/mean_length": 6876.46875, + "completions/mean_terminated_length": 6408.884765625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.1018569767475128, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0018562980694696307, + "learning_rate": 1e-05, + "loss": 0.095, + "num_tokens": 91390054.0, + "reward": 0.21875, + "reward_std": 0.29955869913101196, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999849796295166, + "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05, + "sampling/sampling_logp_difference/max": 10.436432838439941, + "sampling/sampling_logp_difference/mean": 0.020825792104005814, + "step": 122 + }, + { + "clip_ratio/high_max": 2.022083435804234e-05, + "clip_ratio/high_mean": 5.055208589510585e-06, + "clip_ratio/low_mean": 3.029032552603894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.53455343429232e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14153.0, + "completions/mean_length": 6501.5078125, + "completions/mean_terminated_length": 6344.64306640625, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "entropy": 1.073579266667366, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016695430967956781, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 92241535.0, + "reward": 0.2734375, + "reward_std": 0.28641316294670105, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998984336853027, + "sampling/importance_sampling_ratio/min": 0.0002380236255703494, + "sampling/sampling_logp_difference/max": 8.343140602111816, + "sampling/sampling_logp_difference/mean": 0.020438479259610176, + "step": 123 + }, + { + "clip_ratio/high_max": 3.3911180707946187e-06, + "clip_ratio/high_mean": 8.477795176986547e-07, + "clip_ratio/low_mean": 2.2190370486896427e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.30381500614385e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14345.0, + "completions/max_terminated_length": 14345.0, + "completions/mean_length": 5474.1328125, + "completions/mean_terminated_length": 5474.1328125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0692576617002487, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034909825772047043, + "learning_rate": 1e-05, + "loss": 0.0, + "num_tokens": 92962472.0, + "reward": 0.3046875, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000006079673767, + "sampling/importance_sampling_ratio/min": 0.0017851731972768903, + "sampling/sampling_logp_difference/max": 6.328239917755127, + "sampling/sampling_logp_difference/mean": 0.019930578768253326, + "step": 124 + }, + { + "clip_ratio/high_max": 2.6292200345778838e-05, + "clip_ratio/high_mean": 7.620442374900449e-06, + "clip_ratio/low_mean": 4.615546390596137e-05, + "clip_ratio/low_min": 1.366510537081922e-05, + "clip_ratio/region_mean": 5.3775906508235494e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + "completions/mean_length": 7512.078125, + "completions/mean_terminated_length": 7225.88671875, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9676955863833427, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023449272848665714, + "learning_rate": 1e-05, + "loss": 0.0454, + "num_tokens": 93950506.0, + "reward": 0.3203125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999359250068665, + "sampling/importance_sampling_ratio/min": 0.0016406332142651081, + "sampling/sampling_logp_difference/max": 6.412672996520996, + "sampling/sampling_logp_difference/mean": 0.020141655579209328, + "step": 125 + }, + { + "clip_ratio/high_max": 5.097255780128762e-06, + "clip_ratio/high_mean": 1.2743139450321905e-06, + "clip_ratio/low_mean": 3.3802551342887455e-05, + "clip_ratio/low_min": 4.146762421441963e-06, + "clip_ratio/region_mean": 3.5076865287919645e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6920.484375, + "completions/mean_terminated_length": 6693.3603515625, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "entropy": 0.8662540689110756, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037103090435266495, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 94854016.0, + "reward": 0.4375, + "reward_std": 0.322716623544693, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00047686786274425685, + "sampling/sampling_logp_difference/max": 7.648271083831787, + "sampling/sampling_logp_difference/mean": 0.01915796287357807, + "step": 126 + }, + { + "clip_ratio/high_max": 8.4922439782531e-06, + "clip_ratio/high_mean": 2.123060994563275e-06, + "clip_ratio/low_mean": 5.024227584726759e-05, + "clip_ratio/low_min": 1.3627016414829995e-05, + "clip_ratio/region_mean": 5.236533706920454e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 7939.609375, + 
"completions/mean_terminated_length": 7805.57177734375, + "completions/min_length": 1260.0, + "completions/min_terminated_length": 1260.0, + "entropy": 0.9707008600234985, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024642283096909523, + "learning_rate": 1e-05, + "loss": 0.0788, + "num_tokens": 95889966.0, + "reward": 0.2265625, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998771548271179, + "sampling/importance_sampling_ratio/min": 4.540014560916461e-05, + "sampling/sampling_logp_difference/max": 9.999995231628418, + "sampling/sampling_logp_difference/mean": 0.020453302189707756, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.766829564710861e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.766829564710861e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14969.0, + "completions/mean_length": 5985.8203125, + "completions/mean_terminated_length": 5474.43408203125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.9083090648055077, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003317479742690921, + "learning_rate": 1e-05, + "loss": 0.0537, + "num_tokens": 96676847.0, + "reward": 0.3671875, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.000286750087980181, + "sampling/sampling_logp_difference/max": 8.156899452209473, + "sampling/sampling_logp_difference/mean": 
0.01996719278395176, + "step": 128 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 96676847, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-128/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-128/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-128/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
@dataclass
class zero_model_state:
    """Model-side state pulled out of one ``*_model_states.pt`` checkpoint file.

    Instances are built by ``parse_model_states``; the dataclass itself performs
    no validation of the field types.
    """
    # name -> buffer tensor
    buffers: dict
    # NOTE(review): iterated elsewhere in this file as a sequence of
    # name -> shape mappings (one per param group); annotation kept loose.
    param_shapes: dict
    # list of [name, name] pairs taken from the checkpoint's "shared_params" entry
    shared_params: list
    # NOTE(review): read with ``state_dict.get(DS_VERSION, None)``, so this may be None.
    ds_version: int
    # May be None when the checkpoint has no frozen-parameter entries.
    frozen_param_shapes: dict
    frozen_param_fragments: dict


# Set to a truthy value to make the helpers in this file print extra diagnostics.
debug = 0

# load to cpu
device = torch.device('cpu')


def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of digits, else ``text`` unchanged."""
    return int(text) if text.isdigit() else text


def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)

    The text is split on runs of digits; digit runs become ints so that,
    for example, "rank_2" sorts before "rank_10".
    '''
    return [atoi(c) for c in re.split(r'(\d+)', text)]


def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the single model-states file for the given ZeRO stage.

    Args:
        checkpoint_dir: directory expected to contain the model-states file.
        zero_stage: ZeRO stage; values ``<= 2`` and ``3`` are supported.

    Raises:
        FileNotFoundError: if ``checkpoint_dir`` is not a directory or the
            expected file does not exist.
        ValueError: if ``zero_stage`` is not a supported stage (previously this
            surfaced as an UnboundLocalError on ``file``).
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # same wording as the unknown-stage error raised by parse_optim_states
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file


def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return the files in ``checkpoint_dir`` matching ``glob_pattern``, in natural order.

    Raises:
        FileNotFoundError: if nothing matches the pattern.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)

    if len(ckpt_files) == 0:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files


def get_optim_files(checkpoint_dir):
    """Return all ``*_optim_states.pt`` files in ``checkpoint_dir`` (natural order)."""
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")


def get_model_state_files(checkpoint_dir):
    """Return all ``*_model_states.pt`` files in ``checkpoint_dir`` (natural order)."""
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
def parse_model_states(files):
    """Load every model-states file and collect the pieces needed for reconstruction.

    Args:
        files: paths of ``*_model_states.pt`` files.

    Returns:
        A list with one ``zero_model_state`` per input file, in input order.

    Raises:
        ValueError: if a file has no ``BUFFER_NAMES`` entry.
    """
    zero_model_states = []
    for file in files:
        # NOTE(review): weights_only=False unpickles arbitrary objects - only run
        # this on checkpoints from a trusted source.
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional in the checkpoint
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        zero_model_states.append(
            zero_model_state(buffers=buffers,
                             param_shapes=param_shapes,
                             shared_params=shared_params,
                             ds_version=ds_version,
                             frozen_param_shapes=frozen_param_shapes,
                             frozen_param_fragments=frozen_param_fragments))

    return zero_model_states


def parse_optim_states(files, ds_checkpoint_dir):
    """Load the optimizer-state shards and extract the fp32 flat groups.

    Args:
        files: paths of ``*_optim_states.pt`` files.
        ds_checkpoint_dir: checkpoint directory; used only in the error message.

    Returns:
        Tuple ``(zero_stage, world_size, fp32_flat_groups)`` where
        ``fp32_flat_groups`` has one entry per loaded file.

    Raises:
        ValueError: if the first file has no ZeRO stage entry, if the number of
            files differs from the recorded partition count, or if the stage is
            not one handled here.
    """
    total_files = len(files)
    state_dicts = []
    for f in tqdm(files, desc='Loading checkpoint shards'):
        # NOTE(review): weights_only=False unpickles arbitrary objects - only run
        # this on checkpoints from a trusted source.
        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [sd[OPTIMIZER_STATE_DICT][fp32_groups_key] for sd in state_dicts]
    return zero_stage, world_size, fp32_flat_groups


def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: forwarded unchanged to the stage-specific reconstruction function

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # parse_optim_states has already rejected any other stage value
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy the frozen parameters recorded by the first model state into ``state_dict``.

    Does nothing when the first model state has no frozen parameter shapes.
    Each entry of ``frozen_param_fragments`` is stored under its name as-is;
    the shapes are used only for the element counts that get printed.

    Args:
        state_dict: mapping updated in place.
        zero_model_states: sequence whose first element provides
            ``frozen_param_shapes`` and ``frozen_param_fragments``.
    """
    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
        return

    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    def _numel(shape):
        # Same tolerance as the trainable-params merge: accept shape objects that
        # expose numel() as well as plain sequences of ints.
        return shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)

    if debug:
        num_elem = sum(_numel(s) for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(_numel(s) for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = _numel(shape)
        total_numel += unpartitioned_numel

        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _has_callable(obj, fn):
    """Return True when ``obj`` has an attribute named ``fn`` and that attribute is callable."""
    attr = getattr(obj, fn, None)
    return callable(attr)
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild every trainable parameter from the per-rank fp32 flat groups.

    For each param group the per-rank partitions are concatenated into one flat
    vector, which is then sliced in the order given by ``param_shapes`` and each
    slice stored in ``state_dict`` viewed with its full shape.

    Raises:
        ValueError: if, after rounding both numbers up to a multiple of
            ``2 * world_size``, the consumed element count differs from the
            size of the group's flat vector.
    """
    param_shapes = zero_model_states[0].param_shapes
    group_count = len(fp32_flat_groups[0])

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for rank in range(world_size):
            for group in range(group_count):
                print(f"{FP32_FLAT_GROUPS}[{rank}][{group}].shape={fp32_flat_groups[rank][group].shape}")

    # XXX: memory usage doubles here (zero2)
    flat_vectors = [
        torch.cat([per_rank[group] for per_rank in fp32_flat_groups], 0) for group in range(group_count)
    ]

    if debug:
        avail_numel = sum(vec.numel() for vec in flat_vectors)
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, flat in zip(param_shapes, flat_vectors):
        offset = 0
        avail_numel = flat.numel()
        for name, shape in shapes.items():
            if _has_callable(shape, 'numel'):
                unpartitioned_numel = shape.numel()
            else:
                unpartitioned_numel = math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = flat.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the fp32 state dict for a stage <= 2 checkpoint.

    The result is filled in this order: buffers, frozen parameters (skipped when
    ``exclude_frozen_parameters`` is true), trainable parameters, then shared
    parameters whose second name is already present in the result.
    """
    merged = OrderedDict()

    # buffers
    rank0_buffers = zero_model_states[0].buffers
    merged.update(rank0_buffers)
    if debug:
        print(f"added {len(rank0_buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(merged, zero_model_states)

    _zero2_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        first, second = pair[0], pair[1]
        if second in merged:
            merged[first] = merged[second]

    return merged


def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for splitting a parameter across ranks.

    ``partitioned_numel`` is ``ceil(unpartitioned_numel / world_size)`` and
    ``padding_numel`` is the number of elements missing to reach the next
    multiple of ``world_size`` (0 when it already divides evenly).
    """
    leftover = unpartitioned_numel % world_size
    if leftover:
        padding_numel = world_size - leftover
    else:
        padding_numel = 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/README.md b/dapo_lora_plus_20251202_001141/checkpoint-192/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-192/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users 
(both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-192/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-192/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", 
+ "k_proj", + "gate_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-192/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-192/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false 
-%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/latest b/dapo_lora_plus_20251202_001141/checkpoint-192/latest new file mode 100644 index 0000000000000000000000000000000000000000..36721df7ef9c6f050f37be6e76b3d130ed5cbfc7 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-192/latest @@ -0,0 +1 @@ +global_step192 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-192/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-192/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/tokenizer_config.json 
b/dapo_lora_plus_20251202_001141/checkpoint-192/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-192/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": 
"LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-192/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..86474473dc82b1cdb8c5cd9c25cfca00610f917a --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-192/trainer_state.json @@ -0,0 +1,5986 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1766329346826127, + "eval_steps": 500, + "global_step": 192, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025745572056621313, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 5.499582130141789e-06, + "clip_ratio/high_mean": 1.3748955325354473e-06, + "clip_ratio/low_mean": 
2.871888784738985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009378326623846e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 4767.1875, + "completions/mean_terminated_length": 4767.1875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.088237851858139, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002068034838885069, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 1425798.0, + "reward": 0.3046875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 0.01811397261917591, + "sampling/sampling_logp_difference/max": 4.011071681976318, + "sampling/sampling_logp_difference/mean": 0.01877593621611595, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.459846724103045e-05, + "clip_ratio/low_min": 3.4060874440910993e-06, + "clip_ratio/region_mean": 4.459846724103045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 6586.359375, + "completions/mean_terminated_length": 6351.21630859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0497623533010483, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001971944235265255, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 2287420.0, + "reward": 0.28125, + "reward_std": 0.29143062233924866, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999316334724426, + "sampling/importance_sampling_ratio/min": 5.356698966352269e-05, + "sampling/sampling_logp_difference/max": 9.834577560424805, + "sampling/sampling_logp_difference/mean": 0.02137824520468712, + "step": 3 + }, + { + "clip_ratio/high_max": 1.7640652004047297e-05, + "clip_ratio/high_mean": 5.48578327652649e-06, + "clip_ratio/low_mean": 3.218628648937738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.767206976590387e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14690.0, + "completions/max_terminated_length": 14690.0, + "completions/mean_length": 5448.0234375, + "completions/mean_terminated_length": 5448.0234375, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 1.1134418621659279, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016465173102915287, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 3009167.0, + "reward": 0.2890625, + "reward_std": 0.27958330512046814, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 7.889385415182915e-06, + "sampling/sampling_logp_difference/max": 11.749992370605469, + "sampling/sampling_logp_difference/mean": 0.020580951124429703, + "step": 4 + }, + { + "clip_ratio/high_max": 1.3439519989333348e-05, + "clip_ratio/high_mean": 3.359879997333337e-06, + "clip_ratio/low_mean": 2.8849915906903334e-05, + "clip_ratio/low_min": 8.467687621305231e-06, + "clip_ratio/region_mean": 3.220979442630778e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13420.0, + "completions/mean_length": 5436.8671875, + 
"completions/mean_terminated_length": 5350.66943359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 1.1473859176039696, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023770295083522797, + "learning_rate": 1e-05, + "loss": 0.0153, + "num_tokens": 3725654.0, + "reward": 0.2734375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991774559021, + "sampling/importance_sampling_ratio/min": 0.0011146117467433214, + "sampling/sampling_logp_difference/max": 6.799249172210693, + "sampling/sampling_logp_difference/mean": 0.020377254113554955, + "step": 5 + }, + { + "clip_ratio/high_max": 4.652201369026443e-06, + "clip_ratio/high_mean": 1.1630503422566107e-06, + "clip_ratio/low_mean": 2.8399212624208303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9562263534899103e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14440.0, + "completions/max_terminated_length": 14440.0, + "completions/mean_length": 4697.5390625, + "completions/mean_terminated_length": 4697.5390625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.0097229778766632, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003342699259519577, + "learning_rate": 1e-05, + "loss": 0.0326, + "num_tokens": 4345547.0, + "reward": 0.390625, + "reward_std": 0.34480881690979004, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914765357971, + "sampling/importance_sampling_ratio/min": 0.002385853324085474, + "sampling/sampling_logp_difference/max": 6.038198471069336, + 
"sampling/sampling_logp_difference/mean": 0.0185473021119833, + "step": 6 + }, + { + "clip_ratio/high_max": 9.362594937556423e-06, + "clip_ratio/high_mean": 2.340648734389106e-06, + "clip_ratio/low_mean": 6.054362825125281e-05, + "clip_ratio/low_min": 7.427356649714056e-06, + "clip_ratio/region_mean": 6.288427744038927e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14652.0, + "completions/mean_length": 6218.2109375, + "completions/mean_terminated_length": 5890.2822265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.0579778030514717, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002073560608550906, + "learning_rate": 1e-05, + "loss": 0.0201, + "num_tokens": 5160646.0, + "reward": 0.2109375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 0.00044544730917550623, + "sampling/sampling_logp_difference/max": 7.716431617736816, + "sampling/sampling_logp_difference/mean": 0.020321575924754143, + "step": 7 + }, + { + "clip_ratio/high_max": 1.1064067621191498e-05, + "clip_ratio/high_mean": 2.7660169052978745e-06, + "clip_ratio/low_mean": 2.2175867059104348e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4941883737028547e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13637.0, + "completions/mean_length": 5127.8359375, + "completions/mean_terminated_length": 5039.20458984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.0472618415951729, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.0032994600478559732, + "learning_rate": 1e-05, + "loss": 0.0751, + "num_tokens": 5836289.0, + "reward": 0.3359375, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483227729797, + "sampling/importance_sampling_ratio/min": 0.0013780994340777397, + "sampling/sampling_logp_difference/max": 6.587049961090088, + "sampling/sampling_logp_difference/mean": 0.01940803974866867, + "step": 8 + }, + { + "clip_ratio/high_max": 1.2357884770608507e-05, + "clip_ratio/high_mean": 3.0894711926521268e-06, + "clip_ratio/low_mean": 3.000627111759968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.309574231025181e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15916.0, + "completions/mean_length": 4516.890625, + "completions/mean_terminated_length": 4423.44873046875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.911251038312912, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003016560571268201, + "learning_rate": 1e-05, + "loss": 0.1006, + "num_tokens": 6433171.0, + "reward": 0.390625, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999179840087891, + "sampling/importance_sampling_ratio/min": 0.005480794236063957, + "sampling/sampling_logp_difference/max": 5.206505298614502, + "sampling/sampling_logp_difference/mean": 0.017437148839235306, + "step": 9 + }, + { + "clip_ratio/high_max": 4.6329013457580004e-05, + "clip_ratio/high_mean": 1.1582253364395001e-05, + "clip_ratio/low_mean": 7.069455705277505e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 8.227681109929108e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13970.0, + "completions/mean_length": 4961.453125, + "completions/mean_terminated_length": 4687.31201171875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "entropy": 0.6808596402406693, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0035386616364121437, + "learning_rate": 1e-05, + "loss": 0.0596, + "num_tokens": 7085389.0, + "reward": 0.5625, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0002734088629949838, + "sampling/sampling_logp_difference/max": 8.20454216003418, + "sampling/sampling_logp_difference/mean": 0.01566406339406967, + "step": 10 + }, + { + "clip_ratio/high_max": 2.43190661421977e-05, + "clip_ratio/high_mean": 6.079766535549425e-06, + "clip_ratio/low_mean": 2.2395396172214532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8475162707763957e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14776.0, + "completions/mean_length": 4429.40625, + "completions/mean_terminated_length": 4335.275390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9181502386927605, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022535293828696012, + "learning_rate": 1e-05, + "loss": 0.0031, + "num_tokens": 7672185.0, + "reward": 0.3671875, + "reward_std": 0.20357418060302734, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998801946640015, + "sampling/importance_sampling_ratio/min": 5.315856554943821e-08, + "sampling/sampling_logp_difference/max": 16.74998664855957, + "sampling/sampling_logp_difference/mean": 0.018429335206747055, + "step": 11 + }, + { + "clip_ratio/high_max": 1.0117325928149512e-05, + "clip_ratio/high_mean": 2.529331482037378e-06, + "clip_ratio/low_mean": 1.1982813475697185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.45121450714214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5282.6796875, + "completions/mean_terminated_length": 5106.46875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "entropy": 1.113751620054245, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013591813622042537, + "learning_rate": 1e-05, + "loss": 0.0971, + "num_tokens": 8369000.0, + "reward": 0.3984375, + "reward_std": 0.3029736578464508, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 3.970265970565379e-05, + "sampling/sampling_logp_difference/max": 10.134092330932617, + "sampling/sampling_logp_difference/mean": 0.020221836864948273, + "step": 12 + }, + { + "clip_ratio/high_max": 5.411958227341529e-06, + "clip_ratio/high_mean": 1.3529895568353822e-06, + "clip_ratio/low_mean": 2.5284593846208736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6637583516730956e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15925.0, + "completions/mean_length": 6970.421875, + "completions/mean_terminated_length": 6744.49609375, + "completions/min_length": 53.0, + 
"completions/min_terminated_length": 53.0, + "entropy": 1.1721933633089066, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024079051800072193, + "learning_rate": 1e-05, + "loss": 0.0713, + "num_tokens": 9283182.0, + "reward": 0.171875, + "reward_std": 0.17965975403785706, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999163746833801, + "sampling/importance_sampling_ratio/min": 0.0008915197686292231, + "sampling/sampling_logp_difference/max": 7.0225830078125, + "sampling/sampling_logp_difference/mean": 0.021462474018335342, + "step": 13 + }, + { + "clip_ratio/high_max": 2.0661535927501973e-05, + "clip_ratio/high_mean": 5.165383981875493e-06, + "clip_ratio/low_mean": 2.4304956298237812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.947033948430544e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14658.0, + "completions/max_terminated_length": 14658.0, + "completions/mean_length": 4886.875, + "completions/mean_terminated_length": 4886.875, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 1.0108910650014877, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002063734456896782, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 9928446.0, + "reward": 0.3515625, + "reward_std": 0.2409384697675705, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 0.0003672837920021266, + "sampling/sampling_logp_difference/max": 7.9093756675720215, + "sampling/sampling_logp_difference/mean": 0.01918785460293293, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.4761846993424115e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4761846993424115e-06, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12992.0, + "completions/max_terminated_length": 12992.0, + "completions/mean_length": 4824.0078125, + "completions/mean_terminated_length": 4824.0078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 1.1070282831788063, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002424790756776929, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 10566415.0, + "reward": 0.28125, + "reward_std": 0.23698672652244568, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0011708867968991399, + "sampling/sampling_logp_difference/max": 6.749993801116943, + "sampling/sampling_logp_difference/mean": 0.02069389820098877, + "step": 15 + }, + { + "clip_ratio/high_max": 3.5075904634140898e-06, + "clip_ratio/high_mean": 8.768976158535224e-07, + "clip_ratio/low_mean": 2.2676964135825983e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3553861751679506e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12685.0, + "completions/mean_length": 5449.4140625, + "completions/mean_terminated_length": 5363.31494140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9817888736724854, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021046048495918512, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 11281908.0, + "reward": 0.2265625, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.2265625, + 
"rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.013273254036903381, + "sampling/sampling_logp_difference/max": 4.322004318237305, + "sampling/sampling_logp_difference/mean": 0.019556276500225067, + "step": 16 + }, + { + "clip_ratio/high_max": 1.624216065465589e-05, + "clip_ratio/high_mean": 4.060540163663973e-06, + "clip_ratio/low_mean": 5.4349347919924185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.840988796990132e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14133.0, + "completions/max_terminated_length": 14133.0, + "completions/mean_length": 5343.25, + "completions/mean_terminated_length": 5343.25, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 1.04741720110178, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035894038155674934, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 11987692.0, + "reward": 0.3359375, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998996257781982, + "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05, + "sampling/sampling_logp_difference/max": 10.749964714050293, + "sampling/sampling_logp_difference/mean": 0.020530637353658676, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.272115029380075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.272115029380075e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15138.0, + "completions/mean_length": 6301.9375, + "completions/mean_terminated_length": 
5806.09814453125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.8892941772937775, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032246762420982122, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 12814244.0, + "reward": 0.3125, + "reward_std": 0.3606000542640686, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999184608459473, + "sampling/importance_sampling_ratio/min": 0.021351110190153122, + "sampling/sampling_logp_difference/max": 3.846651554107666, + "sampling/sampling_logp_difference/mean": 0.017541853711009026, + "step": 18 + }, + { + "clip_ratio/high_max": 9.956602298188955e-06, + "clip_ratio/high_mean": 2.4891505745472386e-06, + "clip_ratio/low_mean": 2.772165316855535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0210803743102588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16213.0, + "completions/max_terminated_length": 16213.0, + "completions/mean_length": 5297.46875, + "completions/mean_terminated_length": 5297.46875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8097029253840446, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023969109170138836, + "learning_rate": 1e-05, + "loss": -0.0153, + "num_tokens": 13512520.0, + "reward": 0.359375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999222159385681, + "sampling/importance_sampling_ratio/min": 0.005766105372458696, + "sampling/sampling_logp_difference/max": 5.155758380889893, + "sampling/sampling_logp_difference/mean": 0.017464376986026764, + "step": 19 + }, + { + 
"clip_ratio/high_max": 1.0098337497765897e-05, + "clip_ratio/high_mean": 2.524584374441474e-06, + "clip_ratio/low_mean": 3.173396362399217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.425854845318099e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14655.0, + "completions/mean_length": 4890.34375, + "completions/mean_terminated_length": 4799.84228515625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.9267145916819572, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002759338356554508, + "learning_rate": 1e-05, + "loss": -0.0014, + "num_tokens": 14155556.0, + "reward": 0.3515625, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.008491010405123234, + "sampling/sampling_logp_difference/max": 4.768747329711914, + "sampling/sampling_logp_difference/mean": 0.018839433789253235, + "step": 20 + }, + { + "clip_ratio/high_max": 7.532389190600952e-06, + "clip_ratio/high_mean": 1.883097297650238e-06, + "clip_ratio/low_mean": 1.9051809317716106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0934906729053182e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16296.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 4609.40625, + "completions/mean_terminated_length": 4609.40625, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 1.171089917421341, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021055075339972973, + "learning_rate": 1e-05, + "loss": -0.0051, + "num_tokens": 14765328.0, + "reward": 0.2421875, + "reward_std": 
0.2409384548664093, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741911888123, + "sampling/importance_sampling_ratio/min": 5.368983693188056e-07, + "sampling/sampling_logp_difference/max": 14.437457084655762, + "sampling/sampling_logp_difference/mean": 0.020226795226335526, + "step": 21 + }, + { + "clip_ratio/high_max": 1.7169573766295798e-05, + "clip_ratio/high_mean": 4.2923934415739495e-06, + "clip_ratio/low_mean": 5.869748633813288e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.0162142189074075e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14299.0, + "completions/mean_length": 5099.0390625, + "completions/mean_terminated_length": 5010.18115234375, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.005959376692772, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027595218271017075, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 15438549.0, + "reward": 0.296875, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887347221375, + "sampling/importance_sampling_ratio/min": 0.00013984869292471558, + "sampling/sampling_logp_difference/max": 8.87494945526123, + "sampling/sampling_logp_difference/mean": 0.01902824640274048, + "step": 22 + }, + { + "clip_ratio/high_max": 5.162942670722259e-06, + "clip_ratio/high_mean": 1.2907356676805648e-06, + "clip_ratio/low_mean": 3.6872071063953626e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.816280593582633e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16204.0, + "completions/mean_length": 7138.0390625, + "completions/mean_terminated_length": 6839.7822265625, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.0403362140059471, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002748022088780999, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 16373898.0, + "reward": 0.296875, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999048709869385, + "sampling/importance_sampling_ratio/min": 0.0003802926803473383, + "sampling/sampling_logp_difference/max": 7.874569416046143, + "sampling/sampling_logp_difference/mean": 0.020853528752923012, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.6506045439164154e-05, + "clip_ratio/low_min": 5.709326615033206e-06, + "clip_ratio/region_mean": 5.6506045439164154e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14543.0, + "completions/mean_length": 5420.515625, + "completions/mean_terminated_length": 5334.18896484375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 1.1339883506298065, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029502976685762405, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 17088156.0, + "reward": 0.1953125, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 9.70982582657598e-05, + 
"sampling/sampling_logp_difference/max": 9.239787101745605, + "sampling/sampling_logp_difference/mean": 0.0199423898011446, + "step": 24 + }, + { + "clip_ratio/high_max": 5.619998319161823e-06, + "clip_ratio/high_mean": 1.4049995797904558e-06, + "clip_ratio/low_mean": 6.439320418394345e-05, + "clip_ratio/low_min": 4.70632539872895e-06, + "clip_ratio/region_mean": 6.57982034226734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14636.0, + "completions/mean_length": 5116.3046875, + "completions/mean_terminated_length": 4845.88037109375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.9503882825374603, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004891107324510813, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 17766619.0, + "reward": 0.3203125, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0010618992382660508, + "sampling/sampling_logp_difference/max": 6.847696304321289, + "sampling/sampling_logp_difference/mean": 0.01914183795452118, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.839018643247982e-05, + "clip_ratio/low_min": 4.115091087442124e-06, + "clip_ratio/region_mean": 3.839018643247982e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14634.0, + "completions/mean_length": 5061.8671875, + "completions/mean_terminated_length": 4972.71630859375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.0540335327386856, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0030373274348676205, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 18432938.0, + "reward": 0.34375, + "reward_std": 0.28118088841438293, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999624490737915, + "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06, + "sampling/sampling_logp_difference/max": 13.272432327270508, + "sampling/sampling_logp_difference/mean": 0.019548218697309494, + "step": 26 + }, + { + "clip_ratio/high_max": 1.4656657867817557e-05, + "clip_ratio/high_mean": 4.665093399580655e-06, + "clip_ratio/low_mean": 3.751162262233265e-05, + "clip_ratio/low_min": 4.413062470121076e-06, + "clip_ratio/region_mean": 4.2176716192443564e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15782.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6349.9765625, + "completions/mean_terminated_length": 6349.9765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0268081277608871, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017623496241867542, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 19264743.0, + "reward": 0.2734375, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 6.870362267363816e-05, + "sampling/sampling_logp_difference/max": 9.585708618164062, + "sampling/sampling_logp_difference/mean": 0.019106190651655197, + "step": 27 + }, + { + "clip_ratio/high_max": 9.221375876222737e-06, + "clip_ratio/high_mean": 2.3053439690556843e-06, + "clip_ratio/low_mean": 3.09787185415189e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.328406273794826e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 5815.484375, + "completions/mean_terminated_length": 5561.84033203125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 1.0389493256807327, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003111837198957801, + "learning_rate": 1e-05, + "loss": -0.0162, + "num_tokens": 20030109.0, + "reward": 0.34375, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000298023223877, + "sampling/importance_sampling_ratio/min": 0.02987043187022209, + "sampling/sampling_logp_difference/max": 3.5108861923217773, + "sampling/sampling_logp_difference/mean": 0.020060991868376732, + "step": 28 + }, + { + "clip_ratio/high_max": 6.7810142354574054e-06, + "clip_ratio/high_mean": 1.6952535588643514e-06, + "clip_ratio/low_mean": 4.474762545214617e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644287901101052e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 5157.1484375, + "completions/mean_terminated_length": 5068.748046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.0510126948356628, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003041633637621999, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 20710904.0, + "reward": 0.3125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.04357198625802994, + "sampling/sampling_logp_difference/max": 3.133340835571289, + "sampling/sampling_logp_difference/mean": 0.019007597118616104, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0962848566341563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0962848566341563e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15333.0, + "completions/max_terminated_length": 15333.0, + "completions/mean_length": 4446.3828125, + "completions/mean_terminated_length": 4446.3828125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.053279548883438, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022369560319930315, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 21298497.0, + "reward": 0.390625, + "reward_std": 0.24169495701789856, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998750686645508, + "sampling/importance_sampling_ratio/min": 0.006704842206090689, + "sampling/sampling_logp_difference/max": 5.00492525100708, + "sampling/sampling_logp_difference/mean": 0.01947362720966339, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8460265411922592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8460265411922592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15386.0, + "completions/mean_length": 6294.1484375, + "completions/mean_terminated_length": 6133.9921875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, 
+ "entropy": 1.2036212533712387, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021383841522037983, + "learning_rate": 1e-05, + "loss": 0.033, + "num_tokens": 22124812.0, + "reward": 0.171875, + "reward_std": 0.20752590894699097, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858736991882, + "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07, + "sampling/sampling_logp_difference/max": 14.742476463317871, + "sampling/sampling_logp_difference/mean": 0.022367021068930626, + "step": 31 + }, + { + "clip_ratio/high_max": 1.73864664247958e-05, + "clip_ratio/high_mean": 4.34661660619895e-06, + "clip_ratio/low_mean": 3.19569651310303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630358173722925e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14893.0, + "completions/mean_length": 6011.4921875, + "completions/mean_terminated_length": 5929.81884765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.123318687081337, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00126531848218292, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 22915091.0, + "reward": 0.171875, + "reward_std": 0.2330477386713028, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05, + "sampling/sampling_logp_difference/max": 11.02016544342041, + "sampling/sampling_logp_difference/mean": 0.019905246794223785, + "step": 32 + }, + { + "clip_ratio/high_max": 2.8753217975463485e-05, + "clip_ratio/high_mean": 
7.188304493865871e-06, + "clip_ratio/low_mean": 3.818478444372886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.537308905128157e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5152.46875, + "completions/mean_terminated_length": 5064.03125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 1.0477670058608055, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030069497879594564, + "learning_rate": 1e-05, + "loss": 0.1026, + "num_tokens": 23596487.0, + "reward": 0.3359375, + "reward_std": 0.29142576456069946, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 9.009604013954231e-07, + "sampling/sampling_logp_difference/max": 13.919804573059082, + "sampling/sampling_logp_difference/mean": 0.019003981724381447, + "step": 33 + }, + { + "clip_ratio/high_max": 3.069575450354023e-05, + "clip_ratio/high_mean": 7.673938625885057e-06, + "clip_ratio/low_mean": 3.4847614415411954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.252155258654966e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12792.0, + "completions/max_terminated_length": 12792.0, + "completions/mean_length": 4672.5703125, + "completions/mean_terminated_length": 4672.5703125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9471446052193642, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002676331205293536, + "learning_rate": 1e-05, + "loss": 0.0724, + "num_tokens": 24213408.0, + "reward": 0.3203125, + "reward_std": 0.2988021969795227, + "rewards/accuracy_reward/mean": 0.3203125, + 
"rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000251531600952, + "sampling/importance_sampling_ratio/min": 0.0013351094676181674, + "sampling/sampling_logp_difference/max": 6.618741989135742, + "sampling/sampling_logp_difference/mean": 0.0179576613008976, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6127243245355203e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6127243245355203e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16108.0, + "completions/mean_length": 7013.734375, + "completions/mean_terminated_length": 6711.4677734375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 1.1254516392946243, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023615453392267227, + "learning_rate": 1e-05, + "loss": 0.0384, + "num_tokens": 25130262.0, + "reward": 0.1953125, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06, + "sampling/sampling_logp_difference/max": 11.925450325012207, + "sampling/sampling_logp_difference/mean": 0.0215257927775383, + "step": 35 + }, + { + "clip_ratio/high_max": 4.06954040954588e-06, + "clip_ratio/high_mean": 1.01738510238647e-06, + "clip_ratio/low_mean": 4.180071573500754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.281810015527299e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5858.59375, + 
"completions/mean_terminated_length": 5605.984375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 1.0713739022612572, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029018481727689505, + "learning_rate": 1e-05, + "loss": 0.1041, + "num_tokens": 25898194.0, + "reward": 0.3671875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05, + "sampling/sampling_logp_difference/max": 10.992064476013184, + "sampling/sampling_logp_difference/mean": 0.019959844648838043, + "step": 36 + }, + { + "clip_ratio/high_max": 1.2810827229259303e-05, + "clip_ratio/high_mean": 3.2027068073148257e-06, + "clip_ratio/low_mean": 3.29701083501277e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.617281504375569e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14004.0, + "completions/mean_length": 6952.6015625, + "completions/mean_terminated_length": 6726.24853515625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.028619796037674, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022342968732118607, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 26812791.0, + "reward": 0.234375, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 4.540153167909011e-05, + "sampling/sampling_logp_difference/max": 9.999964714050293, + 
"sampling/sampling_logp_difference/mean": 0.02002539485692978, + "step": 37 + }, + { + "clip_ratio/high_max": 1.5225089100567857e-05, + "clip_ratio/high_mean": 6.960676159906143e-06, + "clip_ratio/low_mean": 4.09088329433871e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7869508762232726e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16361.0, + "completions/mean_length": 6413.421875, + "completions/mean_terminated_length": 6174.12841796875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9452399462461472, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021800603717565536, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 27652757.0, + "reward": 0.296875, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439120292664, + "sampling/importance_sampling_ratio/min": 3.895394547726028e-05, + "sampling/sampling_logp_difference/max": 10.153130531311035, + "sampling/sampling_logp_difference/mean": 0.019722118973731995, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.9564903318023426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9564903318023426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15754.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 5176.3515625, + "completions/mean_terminated_length": 5176.3515625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 1.0444758981466293, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004153470974415541, + "learning_rate": 1e-05, + "loss": 0.0798, + 
"num_tokens": 28334386.0, + "reward": 0.2734375, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.007421077694743872, + "sampling/sampling_logp_difference/max": 4.903430938720703, + "sampling/sampling_logp_difference/mean": 0.020159056410193443, + "step": 39 + }, + { + "clip_ratio/high_max": 1.725743459246587e-05, + "clip_ratio/high_mean": 4.3143586481164675e-06, + "clip_ratio/low_mean": 2.0204584302518924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.451894306432223e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 5178.9921875, + "completions/mean_terminated_length": 5001.13525390625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0803537145256996, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002477057045325637, + "learning_rate": 1e-05, + "loss": 0.0067, + "num_tokens": 29017145.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000497102737427, + "sampling/importance_sampling_ratio/min": 0.004630985204130411, + "sampling/sampling_logp_difference/max": 5.374985694885254, + "sampling/sampling_logp_difference/mean": 0.019826076924800873, + "step": 40 + }, + { + "clip_ratio/high_max": 1.6637992303003557e-05, + "clip_ratio/high_mean": 4.159498075750889e-06, + "clip_ratio/low_mean": 2.1970684144889674e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6130182106953725e-05, + "completions/clipped_ratio": 
0.0, + "completions/max_length": 14131.0, + "completions/max_terminated_length": 14131.0, + "completions/mean_length": 4980.359375, + "completions/mean_terminated_length": 4980.359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.9510642662644386, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016275218222290277, + "learning_rate": 1e-05, + "loss": -0.0097, + "num_tokens": 29673535.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750852584839, + "sampling/importance_sampling_ratio/min": 0.000599516904912889, + "sampling/sampling_logp_difference/max": 7.419386386871338, + "sampling/sampling_logp_difference/mean": 0.01844976656138897, + "step": 41 + }, + { + "clip_ratio/high_max": 2.8087193186365766e-05, + "clip_ratio/high_mean": 7.021798296591442e-06, + "clip_ratio/low_mean": 3.9683913541921356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.670571286169434e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 5778.6953125, + "completions/mean_terminated_length": 5695.18896484375, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 1.0413239300251007, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001847646082751453, + "learning_rate": 1e-05, + "loss": -0.0045, + "num_tokens": 30436416.0, + "reward": 0.2578125, + "reward_std": 0.33903977274894714, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998501539230347, + 
"sampling/importance_sampling_ratio/min": 0.00020348970429040492, + "sampling/sampling_logp_difference/max": 8.499895095825195, + "sampling/sampling_logp_difference/mean": 0.021502099931240082, + "step": 42 + }, + { + "clip_ratio/high_max": 2.68402091023745e-05, + "clip_ratio/high_mean": 8.575278570788214e-06, + "clip_ratio/low_mean": 4.547183698377921e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.404711600931478e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14182.0, + "completions/max_terminated_length": 14182.0, + "completions/mean_length": 4875.125, + "completions/mean_terminated_length": 4875.125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 1.0464690178632736, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021134833805263042, + "learning_rate": 1e-05, + "loss": 0.0727, + "num_tokens": 31083672.0, + "reward": 0.40625, + "reward_std": 0.3584783971309662, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340176582336, + "sampling/importance_sampling_ratio/min": 0.012113225646317005, + "sampling/sampling_logp_difference/max": 4.41345739364624, + "sampling/sampling_logp_difference/mean": 0.019140049815177917, + "step": 43 + }, + { + "clip_ratio/high_max": 3.9877967992651975e-05, + "clip_ratio/high_mean": 9.969491998162994e-06, + "clip_ratio/low_mean": 3.981287841270387e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9782369273998484e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 4691.421875, + "completions/mean_terminated_length": 4505.82568359375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 1.0229775309562683, + "epoch": 
0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037735572550445795, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 31703654.0, + "reward": 0.4453125, + "reward_std": 0.2993389964103699, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492168426514, + "sampling/importance_sampling_ratio/min": 0.03150063753128052, + "sampling/sampling_logp_difference/max": 3.457747459411621, + "sampling/sampling_logp_difference/mean": 0.01912039890885353, + "step": 44 + }, + { + "clip_ratio/high_max": 3.5441889849607833e-06, + "clip_ratio/high_mean": 8.860472462401958e-07, + "clip_ratio/low_mean": 1.5137359810069029e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6023407056309225e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 6821.96875, + "completions/mean_terminated_length": 6592.48046875, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 1.1132484003901482, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0010448681423440576, + "learning_rate": 1e-05, + "loss": 0.022, + "num_tokens": 32599778.0, + "reward": 0.2265625, + "reward_std": 0.1814819872379303, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999915361404419, + "sampling/importance_sampling_ratio/min": 0.006500681862235069, + "sampling/sampling_logp_difference/max": 5.035848140716553, + "sampling/sampling_logp_difference/mean": 0.02125459350645542, + "step": 45 + }, + { + "clip_ratio/high_max": 4.652893949241843e-06, + "clip_ratio/high_mean": 1.1632234873104608e-06, + "clip_ratio/low_mean": 
5.731516603191267e-05, + "clip_ratio/low_min": 9.891066838463303e-06, + "clip_ratio/region_mean": 5.8478389746596804e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 6834.3671875, + "completions/mean_terminated_length": 6605.17626953125, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9827468693256378, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0017670176457613707, + "learning_rate": 1e-05, + "loss": 0.1105, + "num_tokens": 33492737.0, + "reward": 0.3046875, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.0021202093921601772, + "sampling/sampling_logp_difference/max": 6.156240463256836, + "sampling/sampling_logp_difference/mean": 0.019490526989102364, + "step": 46 + }, + { + "clip_ratio/high_max": 6.717360520269722e-06, + "clip_ratio/high_mean": 2.503530367903295e-06, + "clip_ratio/low_mean": 2.5672919832686603e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8176450200589898e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14098.0, + "completions/mean_length": 6175.296875, + "completions/mean_terminated_length": 5845.98388671875, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 1.1584237962961197, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0016891945851966739, + "learning_rate": 1e-05, + "loss": -0.0008, + "num_tokens": 34312455.0, + "reward": 0.1875, + "reward_std": 0.19673937559127808, + "rewards/accuracy_reward/mean": 0.1875, + 
"rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 8.086384332273155e-05, + "sampling/sampling_logp_difference/max": 9.422743797302246, + "sampling/sampling_logp_difference/mean": 0.021749887615442276, + "step": 47 + }, + { + "clip_ratio/high_max": 2.2362002255249536e-05, + "clip_ratio/high_mean": 8.189798336388776e-06, + "clip_ratio/low_mean": 2.1058204993096297e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9248002192616696e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16054.0, + "completions/mean_length": 6036.8359375, + "completions/mean_terminated_length": 5955.3623046875, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.9301538467407227, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003834392176941037, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 35102738.0, + "reward": 0.4375, + "reward_std": 0.36614155769348145, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998494386672974, + "sampling/importance_sampling_ratio/min": 0.00013992394087836146, + "sampling/sampling_logp_difference/max": 8.874411582946777, + "sampling/sampling_logp_difference/mean": 0.019147861748933792, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1501961580506759e-05, + "clip_ratio/high_mean": 2.8754903951266897e-06, + "clip_ratio/low_mean": 4.08189714562468e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.369446196506033e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 
6262.46875, + "completions/mean_terminated_length": 5764.68798828125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.8599015846848488, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029804729856550694, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 35924886.0, + "reward": 0.3984375, + "reward_std": 0.3911295533180237, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999922513961792, + "sampling/importance_sampling_ratio/min": 0.00021375219512265176, + "sampling/sampling_logp_difference/max": 9.904524803161621, + "sampling/sampling_logp_difference/mean": 0.01815103553235531, + "step": 49 + }, + { + "clip_ratio/high_max": 2.4107544049911667e-05, + "clip_ratio/high_mean": 6.026886012477917e-06, + "clip_ratio/low_mean": 3.6588148361715866e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.261503391944643e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14556.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 5926.8984375, + "completions/mean_terminated_length": 5926.8984375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 1.0042993426322937, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022071697749197483, + "learning_rate": 1e-05, + "loss": 0.0059, + "num_tokens": 36700913.0, + "reward": 0.3359375, + "reward_std": 0.3306073546409607, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000010371208191, + "sampling/importance_sampling_ratio/min": 0.0005220364546403289, + "sampling/sampling_logp_difference/max": 7.557773113250732, + 
"sampling/sampling_logp_difference/mean": 0.01954064890742302, + "step": 50 + }, + { + "clip_ratio/high_max": 4.9106265578302555e-06, + "clip_ratio/high_mean": 1.2276566394575639e-06, + "clip_ratio/low_mean": 2.634599570683349e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7573652346291055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15217.0, + "completions/mean_length": 6873.6875, + "completions/mean_terminated_length": 6645.4404296875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 1.0255412608385086, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002320924773812294, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 37604865.0, + "reward": 0.234375, + "reward_std": 0.3135228157043457, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999098777770996, + "sampling/importance_sampling_ratio/min": 0.026153141632676125, + "sampling/sampling_logp_difference/max": 3.6437859535217285, + "sampling/sampling_logp_difference/mean": 0.019532475620508194, + "step": 51 + }, + { + "clip_ratio/high_max": 1.6350510122720152e-05, + "clip_ratio/high_mean": 4.087627530680038e-06, + "clip_ratio/low_mean": 2.351988746340794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7607515221461654e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15668.0, + "completions/mean_length": 6073.8984375, + "completions/mean_terminated_length": 5992.71630859375, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 1.0713753998279572, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002212709980085492, + 
"learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 38405196.0, + "reward": 0.359375, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998978972434998, + "sampling/importance_sampling_ratio/min": 8.706459084351081e-06, + "sampling/sampling_logp_difference/max": 11.651445388793945, + "sampling/sampling_logp_difference/mean": 0.021252838894724846, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.729486718384578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.729486718384578e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15299.0, + "completions/mean_length": 5838.71875, + "completions/mean_terminated_length": 5671.33349609375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 1.021155133843422, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001135052996687591, + "learning_rate": 1e-05, + "loss": 0.0178, + "num_tokens": 39171704.0, + "reward": 0.28125, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.003084881929680705, + "sampling/sampling_logp_difference/max": 5.7812418937683105, + "sampling/sampling_logp_difference/mean": 0.020781882107257843, + "step": 53 + }, + { + "clip_ratio/high_max": 1.7124169744420215e-05, + "clip_ratio/high_mean": 4.281042436105054e-06, + "clip_ratio/low_mean": 3.706903294187214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.135007543482061e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 14617.0, + "completions/max_terminated_length": 14617.0, + "completions/mean_length": 6358.5859375, + "completions/mean_terminated_length": 6358.5859375, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9720487147569656, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002638082252815366, + "learning_rate": 1e-05, + "loss": 0.0145, + "num_tokens": 40003859.0, + "reward": 0.40625, + "reward_std": 0.3174618184566498, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000380277633667, + "sampling/importance_sampling_ratio/min": 0.01960253342986107, + "sampling/sampling_logp_difference/max": 3.932096481323242, + "sampling/sampling_logp_difference/mean": 0.01991666667163372, + "step": 54 + }, + { + "clip_ratio/high_max": 6.55582925901399e-06, + "clip_ratio/high_mean": 2.994117721755174e-06, + "clip_ratio/low_mean": 2.222621503733535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5220332759090525e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14753.0, + "completions/max_terminated_length": 14753.0, + "completions/mean_length": 4634.1875, + "completions/mean_terminated_length": 4634.1875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9715309366583824, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001994960242882371, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 40616483.0, + "reward": 0.4375, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000698566436768, + 
"sampling/importance_sampling_ratio/min": 1.0510009815334342e-05, + "sampling/sampling_logp_difference/max": 11.46318244934082, + "sampling/sampling_logp_difference/mean": 0.01902047172188759, + "step": 55 + }, + { + "clip_ratio/high_max": 2.2474248908110894e-05, + "clip_ratio/high_mean": 7.571314540655294e-06, + "clip_ratio/low_mean": 4.3583780325207044e-05, + "clip_ratio/low_min": 4.6013396968191955e-06, + "clip_ratio/region_mean": 5.1155094070054474e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 6596.25, + "completions/mean_terminated_length": 6361.34423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.8207943215966225, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019902780186384916, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 41484443.0, + "reward": 0.4453125, + "reward_std": 0.326668381690979, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016689300537, + "sampling/importance_sampling_ratio/min": 7.485233072657138e-05, + "sampling/sampling_logp_difference/max": 9.499993324279785, + "sampling/sampling_logp_difference/mean": 0.018301833420991898, + "step": 56 + }, + { + "clip_ratio/high_max": 3.0019932637515012e-06, + "clip_ratio/high_mean": 7.504983159378753e-07, + "clip_ratio/low_mean": 4.332785601945943e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.407835376696312e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6785.75, + "completions/mean_terminated_length": 6313.70458984375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + 
"entropy": 0.9876058474183083, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015235114842653275, + "learning_rate": 1e-05, + "loss": 0.0128, + "num_tokens": 42372235.0, + "reward": 0.2421875, + "reward_std": 0.325075626373291, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999551773071289, + "sampling/importance_sampling_ratio/min": 0.026679370552301407, + "sampling/sampling_logp_difference/max": 3.6238646507263184, + "sampling/sampling_logp_difference/mean": 0.019945615902543068, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1349006601667497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1349006601667497e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 4881.2109375, + "completions/mean_terminated_length": 4510.1533203125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.989942155778408, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002033712575212121, + "learning_rate": 1e-05, + "loss": 0.1088, + "num_tokens": 43015238.0, + "reward": 0.4375, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000300407409668, + "sampling/importance_sampling_ratio/min": 0.0001238943514181301, + "sampling/sampling_logp_difference/max": 8.996081352233887, + "sampling/sampling_logp_difference/mean": 0.01887543685734272, + "step": 58 + }, + { + "clip_ratio/high_max": 2.584004687378183e-05, + "clip_ratio/high_mean": 6.4600117184454575e-06, + 
"clip_ratio/low_mean": 2.1371045761497953e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7831058105221018e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15001.0, + "completions/max_terminated_length": 15001.0, + "completions/mean_length": 4725.3984375, + "completions/mean_terminated_length": 4725.3984375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 1.0350637435913086, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030296226032078266, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 43637737.0, + "reward": 0.4453125, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999939203262329, + "sampling/importance_sampling_ratio/min": 0.00022932067804504186, + "sampling/sampling_logp_difference/max": 8.380389213562012, + "sampling/sampling_logp_difference/mean": 0.01995944231748581, + "step": 59 + }, + { + "clip_ratio/high_max": 1.994733975152485e-05, + "clip_ratio/high_mean": 4.986834937881213e-06, + "clip_ratio/low_mean": 3.5168303838872816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.015513832200668e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 4918.171875, + "completions/mean_terminated_length": 4736.1748046875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.965274304151535, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002758471528068185, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 44285327.0, + "reward": 0.328125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 
0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999663233757019, + "sampling/importance_sampling_ratio/min": 0.010958661325275898, + "sampling/sampling_logp_difference/max": 4.513625144958496, + "sampling/sampling_logp_difference/mean": 0.019083233550190926, + "step": 60 + }, + { + "clip_ratio/high_max": 1.0621563887980301e-05, + "clip_ratio/high_mean": 2.6553909719950752e-06, + "clip_ratio/low_mean": 3.838553107016196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1040922042157035e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15031.0, + "completions/mean_length": 4998.2890625, + "completions/mean_terminated_length": 4908.6376953125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9200445115566254, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027611786499619484, + "learning_rate": 1e-05, + "loss": 0.0575, + "num_tokens": 44944356.0, + "reward": 0.3515625, + "reward_std": 0.3895368278026581, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884366989136, + "sampling/importance_sampling_ratio/min": 0.0018651526188477874, + "sampling/sampling_logp_difference/max": 6.284412384033203, + "sampling/sampling_logp_difference/mean": 0.017853498458862305, + "step": 61 + }, + { + "clip_ratio/high_max": 1.0136624496226432e-05, + "clip_ratio/high_mean": 2.534156124056608e-06, + "clip_ratio/low_mean": 2.0260404085092887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2794560095462657e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6290.1796875, + 
"completions/mean_terminated_length": 6129.96044921875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.9360214695334435, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015557854203507304, + "learning_rate": 1e-05, + "loss": 0.0111, + "num_tokens": 45767867.0, + "reward": 0.34375, + "reward_std": 0.30168038606643677, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999427795410156, + "sampling/importance_sampling_ratio/min": 0.0011004531988874078, + "sampling/sampling_logp_difference/max": 6.812033176422119, + "sampling/sampling_logp_difference/mean": 0.0200855303555727, + "step": 62 + }, + { + "clip_ratio/high_max": 2.2559511307918e-06, + "clip_ratio/high_mean": 5.6398778269795e-07, + "clip_ratio/low_mean": 4.51761221711422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.574010984015331e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16366.0, + "completions/mean_length": 6486.15625, + "completions/mean_terminated_length": 6248.6083984375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.863138921558857, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026953541673719883, + "learning_rate": 1e-05, + "loss": -0.0194, + "num_tokens": 46618575.0, + "reward": 0.2578125, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999406337738037, + "sampling/importance_sampling_ratio/min": 0.0011708897072821856, + "sampling/sampling_logp_difference/max": 6.749991416931152, + 
"sampling/sampling_logp_difference/mean": 0.01863238587975502, + "step": 63 + }, + { + "clip_ratio/high_max": 1.0073357771034352e-05, + "clip_ratio/high_mean": 2.518339442758588e-06, + "clip_ratio/low_mean": 2.787370635815023e-05, + "clip_ratio/low_min": 3.837534222839167e-06, + "clip_ratio/region_mean": 3.0392045573535142e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 6442.7734375, + "completions/mean_terminated_length": 6284.9765625, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0242054909467697, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024442619178444147, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 47462274.0, + "reward": 0.328125, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998892545700073, + "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09, + "sampling/sampling_logp_difference/max": 19.124980926513672, + "sampling/sampling_logp_difference/mean": 0.019810764119029045, + "step": 64 + }, + { + "clip_ratio/high_max": 1.220810372615233e-05, + "clip_ratio/high_mean": 3.0520259315380827e-06, + "clip_ratio/low_mean": 4.339240456374682e-05, + "clip_ratio/low_min": 4.491233084991109e-06, + "clip_ratio/region_mean": 4.644443038159807e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 4807.765625, + "completions/mean_terminated_length": 4716.6142578125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 1.045751042664051, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.002512057079002261, + "learning_rate": 1e-05, + "loss": 0.003, + "num_tokens": 48096692.0, + "reward": 0.3671875, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999058842658997, + "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05, + "sampling/sampling_logp_difference/max": 11.374892234802246, + "sampling/sampling_logp_difference/mean": 0.01960371434688568, + "step": 65 + }, + { + "clip_ratio/high_max": 5.37941218681226e-06, + "clip_ratio/high_mean": 1.344853046703065e-06, + "clip_ratio/low_mean": 3.0161771633174794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1506624850408116e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 6703.8359375, + "completions/mean_terminated_length": 6471.51220703125, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 1.0592866837978363, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016389708034694195, + "learning_rate": 1e-05, + "loss": -0.024, + "num_tokens": 48974399.0, + "reward": 0.2734375, + "reward_std": 0.2585548758506775, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999353885650635, + "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06, + "sampling/sampling_logp_difference/max": 11.8125, + "sampling/sampling_logp_difference/mean": 0.020880095660686493, + "step": 66 + }, + { + "clip_ratio/high_max": 7.093600515872822e-06, + "clip_ratio/high_mean": 1.7734001289682055e-06, + "clip_ratio/low_mean": 4.470584758564655e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.647924811251869e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16295.0, + "completions/mean_length": 6140.5078125, + "completions/mean_terminated_length": 5724.10546875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 1.0998501181602478, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003946912474930286, + "learning_rate": 1e-05, + "loss": 0.0448, + "num_tokens": 49779920.0, + "reward": 0.34375, + "reward_std": 0.36796674132347107, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 2.849436668839189e-07, + "sampling/sampling_logp_difference/max": 15.070974349975586, + "sampling/sampling_logp_difference/mean": 0.021355850622057915, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.313956779038563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.313956779038563e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 6689.8046875, + "completions/mean_terminated_length": 6213.04052734375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.8561654165387154, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021656695753335953, + "learning_rate": 1e-05, + "loss": 0.0283, + "num_tokens": 50655023.0, + "reward": 0.203125, + "reward_std": 0.21723884344100952, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999941885471344, + "sampling/importance_sampling_ratio/min": 2.836359499269747e-06, + "sampling/sampling_logp_difference/max": 12.772989273071289, + "sampling/sampling_logp_difference/mean": 0.01873670145869255, + "step": 68 + }, + { + "clip_ratio/high_max": 2.3421607693308033e-05, + "clip_ratio/high_mean": 7.242933975248889e-06, + "clip_ratio/low_mean": 3.896083626386826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.620377103492501e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14330.0, + "completions/max_terminated_length": 14330.0, + "completions/mean_length": 5707.0078125, + "completions/mean_terminated_length": 5707.0078125, + "completions/min_length": 625.0, + "completions/min_terminated_length": 625.0, + "entropy": 1.1396166533231735, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004121148493140936, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 51406536.0, + "reward": 0.3125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999328851699829, + "sampling/importance_sampling_ratio/min": 0.0005196487763896585, + "sampling/sampling_logp_difference/max": 7.562357425689697, + "sampling/sampling_logp_difference/mean": 0.020000409334897995, + "step": 69 + }, + { + "clip_ratio/high_max": 1.82290532393381e-05, + "clip_ratio/high_mean": 4.557263309834525e-06, + "clip_ratio/low_mean": 2.5275351731579576e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9832615496161452e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5655.6328125, + "completions/mean_terminated_length": 5571.1572265625, + "completions/min_length": 157.0, + 
"completions/min_terminated_length": 157.0, + "entropy": 0.8928132206201553, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032538517843931913, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 52148473.0, + "reward": 0.3984375, + "reward_std": 0.29432642459869385, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000033378601074, + "sampling/importance_sampling_ratio/min": 0.0017573959194123745, + "sampling/sampling_logp_difference/max": 6.343922138214111, + "sampling/sampling_logp_difference/mean": 0.018881790339946747, + "step": 70 + }, + { + "clip_ratio/high_max": 1.2836022506235167e-05, + "clip_ratio/high_mean": 3.209005626558792e-06, + "clip_ratio/low_mean": 3.8109637216621195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.131864307055366e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7399.7890625, + "completions/mean_terminated_length": 7034.5771484375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.8808257132768631, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002061733277514577, + "learning_rate": 1e-05, + "loss": 0.0191, + "num_tokens": 53113230.0, + "reward": 0.3046875, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673962593079, + "sampling/importance_sampling_ratio/min": 0.005283349193632603, + "sampling/sampling_logp_difference/max": 5.243195056915283, + "sampling/sampling_logp_difference/mean": 0.018456293269991875, + "step": 71 + }, + { + "clip_ratio/high_max": 
1.5806871488166507e-05, + "clip_ratio/high_mean": 4.739466817227367e-06, + "clip_ratio/low_mean": 3.610486896832299e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.084433521711617e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16208.0, + "completions/mean_length": 5730.9609375, + "completions/mean_terminated_length": 5475.2880859375, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9486126750707626, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012298432411625981, + "learning_rate": 1e-05, + "loss": 0.0208, + "num_tokens": 53864049.0, + "reward": 0.359375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999348521232605, + "sampling/importance_sampling_ratio/min": 4.832820559386164e-05, + "sampling/sampling_logp_difference/max": 9.937495231628418, + "sampling/sampling_logp_difference/mean": 0.01919996738433838, + "step": 72 + }, + { + "clip_ratio/high_max": 1.2390134997986024e-05, + "clip_ratio/high_mean": 3.097533749496506e-06, + "clip_ratio/low_mean": 3.8867822581778455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.19653564449618e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13500.0, + "completions/mean_length": 4620.5703125, + "completions/mean_terminated_length": 4527.94482421875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9557560831308365, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002882040338590741, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 54473498.0, + "reward": 0.3984375, + "reward_std": 
0.39294686913490295, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915195465088, + "sampling/importance_sampling_ratio/min": 1.577107298089686e-07, + "sampling/sampling_logp_difference/max": 15.662503242492676, + "sampling/sampling_logp_difference/mean": 0.018525000661611557, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.088819471486204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.088819471486204e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16314.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 5074.0703125, + "completions/mean_terminated_length": 5074.0703125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8830869868397713, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003324020653963089, + "learning_rate": 1e-05, + "loss": 0.0305, + "num_tokens": 55141787.0, + "reward": 0.4609375, + "reward_std": 0.30115634202957153, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999203681945801, + "sampling/importance_sampling_ratio/min": 0.0009876838885247707, + "sampling/sampling_logp_difference/max": 6.920147895812988, + "sampling/sampling_logp_difference/mean": 0.018072880804538727, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.526649884908693e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.526649884908693e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15251.0, + "completions/max_terminated_length": 15251.0, + "completions/mean_length": 6192.1015625, + 
"completions/mean_terminated_length": 6192.1015625, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "entropy": 1.0888547226786613, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017452294705435634, + "learning_rate": 1e-05, + "loss": 0.0216, + "num_tokens": 55954144.0, + "reward": 0.2890625, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473690986633, + "sampling/importance_sampling_ratio/min": 5.061922365712235e-07, + "sampling/sampling_logp_difference/max": 14.496349334716797, + "sampling/sampling_logp_difference/mean": 0.021221645176410675, + "step": 75 + }, + { + "clip_ratio/high_max": 1.6768677141953958e-05, + "clip_ratio/high_mean": 5.080836899651331e-06, + "clip_ratio/low_mean": 3.340929970363504e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.84901372854074e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15740.0, + "completions/mean_length": 6204.296875, + "completions/mean_terminated_length": 6124.1416015625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 1.0423575639724731, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0033357341308146715, + "learning_rate": 1e-05, + "loss": 0.1073, + "num_tokens": 56765470.0, + "reward": 0.3359375, + "reward_std": 0.37875816226005554, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99998539686203, + "sampling/importance_sampling_ratio/min": 4.564182381727733e-05, + "sampling/sampling_logp_difference/max": 9.994686126708984, + 
"sampling/sampling_logp_difference/mean": 0.01908688060939312, + "step": 76 + }, + { + "clip_ratio/high_max": 3.149884150843718e-06, + "clip_ratio/high_mean": 7.874710377109295e-07, + "clip_ratio/low_mean": 2.430614893000893e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.509361991087644e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14409.0, + "completions/max_terminated_length": 14409.0, + "completions/mean_length": 5070.3125, + "completions/mean_terminated_length": 5070.3125, + "completions/min_length": 629.0, + "completions/min_terminated_length": 629.0, + "entropy": 1.0737399458885193, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038695367984473705, + "learning_rate": 1e-05, + "loss": 0.0015, + "num_tokens": 57432958.0, + "reward": 0.390625, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999223947525024, + "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06, + "sampling/sampling_logp_difference/max": 13.376652717590332, + "sampling/sampling_logp_difference/mean": 0.01970684342086315, + "step": 77 + }, + { + "clip_ratio/high_max": 1.9821940441033803e-05, + "clip_ratio/high_mean": 4.955485110258451e-06, + "clip_ratio/low_mean": 2.9055729555693688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.401121466595214e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15799.0, + "completions/mean_length": 5750.21875, + "completions/mean_terminated_length": 5495.00830078125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.9708107560873032, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002927646040916443, + "learning_rate": 1e-05, + 
"loss": 0.0166, + "num_tokens": 58187426.0, + "reward": 0.296875, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999390840530396, + "sampling/importance_sampling_ratio/min": 0.015204614959657192, + "sampling/sampling_logp_difference/max": 4.186156272888184, + "sampling/sampling_logp_difference/mean": 0.019483914598822594, + "step": 78 + }, + { + "clip_ratio/high_max": 2.3815636723156786e-05, + "clip_ratio/high_mean": 5.953909180789196e-06, + "clip_ratio/low_mean": 4.989707144886779e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.585097960647545e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15938.0, + "completions/mean_length": 6067.484375, + "completions/mean_terminated_length": 5986.251953125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9576351121068001, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0026169484481215477, + "learning_rate": 1e-05, + "loss": -0.0055, + "num_tokens": 58983336.0, + "reward": 0.390625, + "reward_std": 0.3406373858451843, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620914459229, + "sampling/importance_sampling_ratio/min": 1.974713995878119e-06, + "sampling/sampling_logp_difference/max": 13.135087013244629, + "sampling/sampling_logp_difference/mean": 0.019007554277777672, + "step": 79 + }, + { + "clip_ratio/high_max": 2.4238934656750644e-05, + "clip_ratio/high_mean": 7.786730066072778e-06, + "clip_ratio/low_mean": 4.5700241571466904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3486972547034384e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 13640.0, + "completions/max_terminated_length": 13640.0, + "completions/mean_length": 4612.8984375, + "completions/mean_terminated_length": 4612.8984375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.9636320173740387, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015429699560627341, + "learning_rate": 1e-05, + "loss": -0.018, + "num_tokens": 59590763.0, + "reward": 0.421875, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08, + "sampling/sampling_logp_difference/max": 17.468652725219727, + "sampling/sampling_logp_difference/mean": 0.019313856959342957, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0911465842109465e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0911465842109465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 6101.3125, + "completions/mean_terminated_length": 5854.5283203125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.8831139355897903, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022505265660583973, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 60391283.0, + "reward": 0.3125, + "reward_std": 0.29302334785461426, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + 
"sampling/importance_sampling_ratio/min": 0.0003816343960352242, + "sampling/sampling_logp_difference/max": 7.871047496795654, + "sampling/sampling_logp_difference/mean": 0.018377842381596565, + "step": 81 + }, + { + "clip_ratio/high_max": 1.547606643725885e-05, + "clip_ratio/high_mean": 3.869016609314713e-06, + "clip_ratio/low_mean": 2.478705800967873e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8656074391619768e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14862.0, + "completions/mean_length": 4705.9921875, + "completions/mean_terminated_length": 4614.03955078125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.9557913094758987, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002069958718493581, + "learning_rate": 1e-05, + "loss": -0.0015, + "num_tokens": 61021490.0, + "reward": 0.4296875, + "reward_std": 0.2637920379638672, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030232429504, + "sampling/importance_sampling_ratio/min": 2.76673017651774e-05, + "sampling/sampling_logp_difference/max": 10.495259284973145, + "sampling/sampling_logp_difference/mean": 0.018629569560289383, + "step": 82 + }, + { + "clip_ratio/high_max": 2.0910484636260662e-05, + "clip_ratio/high_mean": 5.2276211590651656e-06, + "clip_ratio/low_mean": 1.952954164607945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4757162805144617e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13745.0, + "completions/max_terminated_length": 13745.0, + "completions/mean_length": 5116.78125, + "completions/mean_terminated_length": 5116.78125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 1.0198405236005783, 
+ "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034461067989468575, + "learning_rate": 1e-05, + "loss": -0.0073, + "num_tokens": 61695382.0, + "reward": 0.265625, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999936819076538, + "sampling/importance_sampling_ratio/min": 0.012227212078869343, + "sampling/sampling_logp_difference/max": 4.4040913581848145, + "sampling/sampling_logp_difference/mean": 0.019400250166654587, + "step": 83 + }, + { + "clip_ratio/high_max": 1.5340228401328204e-05, + "clip_ratio/high_mean": 3.835057100332051e-06, + "clip_ratio/low_mean": 3.150914017169271e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.534419727202476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 5891.9140625, + "completions/mean_terminated_length": 5553.45947265625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.9568078517913818, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025854657869786024, + "learning_rate": 1e-05, + "loss": 0.1013, + "num_tokens": 62474883.0, + "reward": 0.3203125, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001013278961182, + "sampling/importance_sampling_ratio/min": 0.0015072470996528864, + "sampling/sampling_logp_difference/max": 6.497470378875732, + "sampling/sampling_logp_difference/mean": 0.019574139267206192, + "step": 84 + }, + { + "clip_ratio/high_max": 1.108303422370227e-05, + "clip_ratio/high_mean": 2.7707585559255676e-06, + 
"clip_ratio/low_mean": 2.2325777763398946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5096536319324514e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13671.0, + "completions/mean_length": 5300.3359375, + "completions/mean_terminated_length": 5213.06298828125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.9722280204296112, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025075653102248907, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 63172454.0, + "reward": 0.203125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 0.00020346972451079637, + "sampling/sampling_logp_difference/max": 8.499993324279785, + "sampling/sampling_logp_difference/mean": 0.02002432942390442, + "step": 85 + }, + { + "clip_ratio/high_max": 1.3991947980684927e-05, + "clip_ratio/high_mean": 3.4979869951712317e-06, + "clip_ratio/low_mean": 4.893367201930232e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.243165958290774e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15617.0, + "completions/mean_length": 6364.21875, + "completions/mean_terminated_length": 6205.1748046875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 1.0607495978474617, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017982006538659334, + "learning_rate": 1e-05, + "loss": -0.0117, + "num_tokens": 64007602.0, + "reward": 0.2890625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.2890625, + 
"rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 3.823801307589747e-05, + "sampling/sampling_logp_difference/max": 10.171680450439453, + "sampling/sampling_logp_difference/mean": 0.020373597741127014, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6416430046083406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6416430046083406e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14709.0, + "completions/mean_length": 5746.3125, + "completions/mean_terminated_length": 5403.1611328125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "entropy": 0.9913106113672256, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002207317156717181, + "learning_rate": 1e-05, + "loss": 0.063, + "num_tokens": 64762058.0, + "reward": 0.34375, + "reward_std": 0.3264310359954834, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999239444732666, + "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08, + "sampling/sampling_logp_difference/max": 16.744617462158203, + "sampling/sampling_logp_difference/mean": 0.020608089864253998, + "step": 87 + }, + { + "clip_ratio/high_max": 1.2681661701208213e-05, + "clip_ratio/high_mean": 3.1704154253020533e-06, + "clip_ratio/low_mean": 3.541917828897567e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.85895939416514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6088.5625, + 
"completions/mean_terminated_length": 5841.47216796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.9040444120764732, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012974507408216596, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 65561002.0, + "reward": 0.3671875, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998487234115601, + "sampling/importance_sampling_ratio/min": 6.021501121722395e-06, + "sampling/sampling_logp_difference/max": 12.020174026489258, + "sampling/sampling_logp_difference/mean": 0.01939838007092476, + "step": 88 + }, + { + "clip_ratio/high_max": 7.807132533343975e-06, + "clip_ratio/high_mean": 1.9517831333359936e-06, + "clip_ratio/low_mean": 1.8564539345788944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.05163223654381e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15021.0, + "completions/mean_length": 5765.5, + "completions/mean_terminated_length": 5510.65625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.9966336265206337, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0013380619930103421, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 66318482.0, + "reward": 0.375, + "reward_std": 0.13994136452674866, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999471306800842, + "sampling/importance_sampling_ratio/min": 7.288413598871557e-06, + "sampling/sampling_logp_difference/max": 11.829224586486816, + 
"sampling/sampling_logp_difference/mean": 0.018109245225787163, + "step": 89 + }, + { + "clip_ratio/high_max": 1.7906912489706883e-05, + "clip_ratio/high_mean": 4.476728122426721e-06, + "clip_ratio/low_mean": 2.5812531305291486e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0289259655091882e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16120.0, + "completions/mean_length": 5462.78125, + "completions/mean_terminated_length": 5200.67236328125, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "entropy": 0.9345141425728798, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023930128663778305, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 67038582.0, + "reward": 0.46875, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513030052185, + "sampling/importance_sampling_ratio/min": 0.008508839644491673, + "sampling/sampling_logp_difference/max": 4.7666497230529785, + "sampling/sampling_logp_difference/mean": 0.019220296293497086, + "step": 90 + }, + { + "clip_ratio/high_max": 1.551389118503721e-05, + "clip_ratio/high_mean": 3.878472796259302e-06, + "clip_ratio/low_mean": 3.239646628117043e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6274939645863924e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15034.0, + "completions/max_terminated_length": 15034.0, + "completions/mean_length": 5547.5078125, + "completions/mean_terminated_length": 5547.5078125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0511749312281609, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0013633714988827705, + "learning_rate": 1e-05, 
+ "loss": 0.0462, + "num_tokens": 67774487.0, + "reward": 0.203125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05, + "sampling/sampling_logp_difference/max": 11.418023109436035, + "sampling/sampling_logp_difference/mean": 0.020328814163804054, + "step": 91 + }, + { + "clip_ratio/high_max": 1.5384989410449634e-05, + "clip_ratio/high_mean": 3.846247352612409e-06, + "clip_ratio/low_mean": 3.441604167164769e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.826228908110352e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5835.4140625, + "completions/mean_terminated_length": 5406.609375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 1.0024723336100578, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0036165034398436546, + "learning_rate": 1e-05, + "loss": 0.0373, + "num_tokens": 68541660.0, + "reward": 0.34375, + "reward_std": 0.3584783673286438, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 9.518130354990717e-06, + "sampling/sampling_logp_difference/max": 11.562312126159668, + "sampling/sampling_logp_difference/mean": 0.020469525828957558, + "step": 92 + }, + { + "clip_ratio/high_max": 6.105602551542688e-06, + "clip_ratio/high_mean": 1.526400637885672e-06, + "clip_ratio/low_mean": 5.3129634352444555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.46560352177039e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15695.0, + "completions/mean_length": 6252.609375, + "completions/mean_terminated_length": 6172.83447265625, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "entropy": 1.0325519517064095, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022011541295796633, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 69365418.0, + "reward": 0.3828125, + "reward_std": 0.32301604747772217, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998809099197388, + "sampling/importance_sampling_ratio/min": 0.0005531083443202078, + "sampling/sampling_logp_difference/max": 7.4999566078186035, + "sampling/sampling_logp_difference/mean": 0.02079072594642639, + "step": 93 + }, + { + "clip_ratio/high_max": 4.348128641140647e-06, + "clip_ratio/high_mean": 1.0870321602851618e-06, + "clip_ratio/low_mean": 3.0097819148977578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.118485085451539e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15316.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 5581.484375, + "completions/mean_terminated_length": 5581.484375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.9222500994801521, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002300912281498313, + "learning_rate": 1e-05, + "loss": -0.0007, + "num_tokens": 70099320.0, + "reward": 0.296875, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9998577833175659, + "sampling/importance_sampling_ratio/min": 8.140386853483506e-08, + "sampling/sampling_logp_difference/max": 16.323843002319336, + "sampling/sampling_logp_difference/mean": 0.01952272653579712, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5122252029395895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5122252029395895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15781.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5424.140625, + "completions/mean_terminated_length": 5424.140625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 1.0446564108133316, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016312639927491546, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 70811474.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000094175338745, + "sampling/importance_sampling_ratio/min": 0.0021919538266956806, + "sampling/sampling_logp_difference/max": 6.12296199798584, + "sampling/sampling_logp_difference/mean": 0.019741754978895187, + "step": 95 + }, + { + "clip_ratio/high_max": 1.0354576261306647e-05, + "clip_ratio/high_mean": 3.496124691082514e-06, + "clip_ratio/low_mean": 4.096481598026003e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.446094089871622e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 5884.9609375, + "completions/mean_terminated_length": 5884.9609375, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9605691060423851, + "epoch": 
0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032865386456251144, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 71582701.0, + "reward": 0.4140625, + "reward_std": 0.3514111638069153, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999833106994629, + "sampling/importance_sampling_ratio/min": 1.149311810877407e-05, + "sampling/sampling_logp_difference/max": 11.373762130737305, + "sampling/sampling_logp_difference/mean": 0.019438734278082848, + "step": 96 + }, + { + "clip_ratio/high_max": 1.026998006636859e-05, + "clip_ratio/high_mean": 2.5674950165921473e-06, + "clip_ratio/low_mean": 3.5440503552308655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8007998455213965e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15361.0, + "completions/max_terminated_length": 15361.0, + "completions/mean_length": 4835.09375, + "completions/mean_terminated_length": 4835.09375, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.9038172215223312, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004721678793430328, + "learning_rate": 1e-05, + "loss": 0.1143, + "num_tokens": 72220025.0, + "reward": 0.4765625, + "reward_std": 0.38481879234313965, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99994957447052, + "sampling/importance_sampling_ratio/min": 2.710051205667696e-07, + "sampling/sampling_logp_difference/max": 15.12112808227539, + "sampling/sampling_logp_difference/mean": 0.017888439819216728, + "step": 97 + }, + { + "clip_ratio/high_max": 2.93432283342554e-05, + "clip_ratio/high_mean": 9.56252398509605e-06, + "clip_ratio/low_mean": 
4.7865792453194445e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.742831808674964e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14431.0, + "completions/mean_length": 5979.078125, + "completions/mean_terminated_length": 5897.1494140625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 1.0227951630949974, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0010532280430197716, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 73005515.0, + "reward": 0.2890625, + "reward_std": 0.30115631222724915, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999090433120728, + "sampling/importance_sampling_ratio/min": 0.00030157779110595584, + "sampling/sampling_logp_difference/max": 8.10648250579834, + "sampling/sampling_logp_difference/mean": 0.019633149728178978, + "step": 98 + }, + { + "clip_ratio/high_max": 4.203234766464448e-06, + "clip_ratio/high_mean": 1.050808691616112e-06, + "clip_ratio/low_mean": 2.5574990331733716e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6625799137036665e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15886.0, + "completions/max_terminated_length": 15886.0, + "completions/mean_length": 4292.1796875, + "completions/mean_terminated_length": 4292.1796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.8719984591007233, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038324075285345316, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 73572794.0, + "reward": 0.4375, + "reward_std": 0.2972046136856079, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999188780784607, + "sampling/importance_sampling_ratio/min": 0.015675775706768036, + "sampling/sampling_logp_difference/max": 4.155638694763184, + "sampling/sampling_logp_difference/mean": 0.018074234947562218, + "step": 99 + }, + { + "clip_ratio/high_max": 4.431366960488958e-06, + "clip_ratio/high_mean": 1.1078417401222396e-06, + "clip_ratio/low_mean": 4.433405501913512e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.54418968729442e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14674.0, + "completions/max_terminated_length": 14674.0, + "completions/mean_length": 5449.2890625, + "completions/mean_terminated_length": 5449.2890625, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9137986451387405, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004843447357416153, + "learning_rate": 1e-05, + "loss": 0.0166, + "num_tokens": 74289607.0, + "reward": 0.5, + "reward_std": 0.40609243512153625, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 8.851584993863071e-07, + "sampling/sampling_logp_difference/max": 13.937499046325684, + "sampling/sampling_logp_difference/mean": 0.018183842301368713, + "step": 100 + }, + { + "clip_ratio/high_max": 8.212076863856055e-06, + "clip_ratio/high_mean": 2.0530192159640137e-06, + "clip_ratio/low_mean": 3.6279372466196946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.833239122741361e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16163.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 4983.3515625, + "completions/mean_terminated_length": 4983.3515625, + 
"completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "entropy": 0.9354705810546875, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037651765160262585, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 74946484.0, + "reward": 0.3671875, + "reward_std": 0.3090519309043884, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 0.00011593531962716952, + "sampling/sampling_logp_difference/max": 9.062478065490723, + "sampling/sampling_logp_difference/mean": 0.018207306042313576, + "step": 101 + }, + { + "clip_ratio/high_max": 1.3182888324081432e-05, + "clip_ratio/high_mean": 3.295722081020358e-06, + "clip_ratio/low_mean": 2.544108633628639e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8736808644680423e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16039.0, + "completions/mean_length": 6351.1015625, + "completions/mean_terminated_length": 6027.45947265625, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "entropy": 0.9310042560100555, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0009160125628113747, + "learning_rate": 1e-05, + "loss": -0.023, + "num_tokens": 75779145.0, + "reward": 0.3828125, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998877048492432, + "sampling/importance_sampling_ratio/min": 0.0002961359277833253, + "sampling/sampling_logp_difference/max": 8.1246919631958, + "sampling/sampling_logp_difference/mean": 0.018513178452849388, + "step": 102 
+ }, + { + "clip_ratio/high_max": 1.1402620202716207e-05, + "clip_ratio/high_mean": 3.935649147024378e-06, + "clip_ratio/low_mean": 3.059757568735222e-05, + "clip_ratio/low_min": 4.3258582991256844e-06, + "clip_ratio/region_mean": 3.45332257438713e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14471.0, + "completions/mean_length": 5293.40625, + "completions/mean_terminated_length": 4935.64501953125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 1.0732879787683487, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023993055801838636, + "learning_rate": 1e-05, + "loss": 0.1021, + "num_tokens": 76475557.0, + "reward": 0.34375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000077724456787, + "sampling/importance_sampling_ratio/min": 6.613240111619234e-05, + "sampling/sampling_logp_difference/max": 9.623851776123047, + "sampling/sampling_logp_difference/mean": 0.020792219787836075, + "step": 103 + }, + { + "clip_ratio/high_max": 2.130644793396641e-05, + "clip_ratio/high_mean": 8.929533635182452e-06, + "clip_ratio/low_mean": 2.663600798769039e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.556554071337814e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 7619.7578125, + "completions/mean_terminated_length": 7409.41650390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9646238535642624, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014872358879074454, + "learning_rate": 1e-05, + "loss": 0.0439, + "num_tokens": 77474310.0, + 
"reward": 0.34375, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999638795852661, + "sampling/importance_sampling_ratio/min": 0.0016686831368133426, + "sampling/sampling_logp_difference/max": 6.395720481872559, + "sampling/sampling_logp_difference/mean": 0.020074717700481415, + "step": 104 + }, + { + "clip_ratio/high_max": 1.7765815300663235e-05, + "clip_ratio/high_mean": 5.154013138053415e-06, + "clip_ratio/low_mean": 5.166909659237717e-05, + "clip_ratio/low_min": 8.365680514543783e-06, + "clip_ratio/region_mean": 5.68231100714911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15984.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 5959.921875, + "completions/mean_terminated_length": 5959.921875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.004471093416214, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00398358516395092, + "learning_rate": 1e-05, + "loss": 0.1016, + "num_tokens": 78257132.0, + "reward": 0.359375, + "reward_std": 0.3653082847595215, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000170469284058, + "sampling/importance_sampling_ratio/min": 0.0030075267422944307, + "sampling/sampling_logp_difference/max": 5.806637287139893, + "sampling/sampling_logp_difference/mean": 0.020755283534526825, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6946955838648137e-05, + "clip_ratio/high_mean": 4.236738959662034e-06, + "clip_ratio/low_mean": 4.510891039899434e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.934564867653535e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 13736.0, + "completions/mean_length": 5427.03125, + "completions/mean_terminated_length": 5340.755859375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9117375314235687, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019883522763848305, + "learning_rate": 1e-05, + "loss": 0.01, + "num_tokens": 78971072.0, + "reward": 0.375, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000550746917725, + "sampling/importance_sampling_ratio/min": 0.0008046010043472052, + "sampling/sampling_logp_difference/max": 7.125164031982422, + "sampling/sampling_logp_difference/mean": 0.018812140449881554, + "step": 106 + }, + { + "clip_ratio/high_max": 2.968176841022796e-05, + "clip_ratio/high_mean": 7.42044210255699e-06, + "clip_ratio/low_mean": 3.220799408154562e-05, + "clip_ratio/low_min": 5.315981979947537e-06, + "clip_ratio/region_mean": 3.962843629778945e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16293.0, + "completions/max_terminated_length": 16293.0, + "completions/mean_length": 6062.078125, + "completions/mean_terminated_length": 6062.078125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 1.0164100378751755, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00450351694598794, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 79764434.0, + "reward": 0.2578125, + "reward_std": 0.26355957984924316, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999713897705078, + 
"sampling/importance_sampling_ratio/min": 0.0007411236292682588, + "sampling/sampling_logp_difference/max": 7.207343101501465, + "sampling/sampling_logp_difference/mean": 0.020526543259620667, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.856050622947805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.856050622947805e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13689.0, + "completions/max_terminated_length": 13689.0, + "completions/mean_length": 4856.53125, + "completions/mean_terminated_length": 4856.53125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 1.0780886858701706, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0033157530706375837, + "learning_rate": 1e-05, + "loss": 0.046, + "num_tokens": 80405238.0, + "reward": 0.3359375, + "reward_std": 0.3487703502178192, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999889135360718, + "sampling/importance_sampling_ratio/min": 0.033773623406887054, + "sampling/sampling_logp_difference/max": 3.7256407737731934, + "sampling/sampling_logp_difference/mean": 0.019188418984413147, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.975351790406421e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.975351790406421e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16335.0, + "completions/max_terminated_length": 16335.0, + "completions/mean_length": 3930.5859375, + "completions/mean_terminated_length": 3930.5859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8666863515973091, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.005471619311720133, + "learning_rate": 1e-05, + "loss": -0.0779, + "num_tokens": 80926721.0, + "reward": 0.5859375, + "reward_std": 0.3164186179637909, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000040531158447, + "sampling/importance_sampling_ratio/min": 0.0002562212466727942, + "sampling/sampling_logp_difference/max": 8.269469261169434, + "sampling/sampling_logp_difference/mean": 0.017708823084831238, + "step": 109 + }, + { + "clip_ratio/high_max": 6.743997801095247e-06, + "clip_ratio/high_mean": 1.6859994502738118e-06, + "clip_ratio/low_mean": 3.61007656692891e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7786765119562915e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15546.0, + "completions/mean_length": 5934.9453125, + "completions/mean_terminated_length": 5684.16845703125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.9991667941212654, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002580739092081785, + "learning_rate": 1e-05, + "loss": -0.0065, + "num_tokens": 81707978.0, + "reward": 0.3046875, + "reward_std": 0.24671243131160736, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000852346420288, + "sampling/importance_sampling_ratio/min": 0.002478762762621045, + "sampling/sampling_logp_difference/max": 5.999995708465576, + "sampling/sampling_logp_difference/mean": 0.019801246002316475, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.43532002741631e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.43532002741631e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 5866.84375, + "completions/mean_terminated_length": 5699.9052734375, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.9848997294902802, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0010949905263260007, + "learning_rate": 1e-05, + "loss": 0.0266, + "num_tokens": 82477310.0, + "reward": 0.2734375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999667406082153, + "sampling/importance_sampling_ratio/min": 9.04304688447155e-05, + "sampling/sampling_logp_difference/max": 9.310929298400879, + "sampling/sampling_logp_difference/mean": 0.020769795402884483, + "step": 111 + }, + { + "clip_ratio/high_max": 1.9307613456476247e-05, + "clip_ratio/high_mean": 4.826903364119062e-06, + "clip_ratio/low_mean": 5.842190330440644e-05, + "clip_ratio/low_min": 1.2287753634154797e-05, + "clip_ratio/region_mean": 6.324880496322294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14501.0, + "completions/max_terminated_length": 14501.0, + "completions/mean_length": 6613.7578125, + "completions/mean_terminated_length": 6613.7578125, + "completions/min_length": 1033.0, + "completions/min_terminated_length": 1033.0, + "entropy": 0.9176012054085732, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020384234376251698, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 83345055.0, + "reward": 0.3671875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999457001686096, + "sampling/importance_sampling_ratio/min": 0.029541675001382828, + "sampling/sampling_logp_difference/max": 3.5219533443450928, + "sampling/sampling_logp_difference/mean": 0.018883168697357178, + "step": 112 + }, + { + "clip_ratio/high_max": 1.382043183184578e-05, + "clip_ratio/high_mean": 3.455107957961445e-06, + "clip_ratio/low_mean": 5.789885449303256e-05, + "clip_ratio/low_min": 1.017130716718384e-05, + "clip_ratio/region_mean": 6.135396188255982e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 6392.3125, + "completions/mean_terminated_length": 6070.0, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "entropy": 0.904954232275486, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0031166900880634785, + "learning_rate": 1e-05, + "loss": 0.0351, + "num_tokens": 84186343.0, + "reward": 0.390625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999208450317383, + "sampling/importance_sampling_ratio/min": 0.00022529886336997151, + "sampling/sampling_logp_difference/max": 8.398082733154297, + "sampling/sampling_logp_difference/mean": 0.01931958645582199, + "step": 113 + }, + { + "clip_ratio/high_max": 1.7221671441802755e-05, + "clip_ratio/high_mean": 6.549099907715572e-06, + "clip_ratio/low_mean": 3.147818074467068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802728065238625e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5982.703125, + "completions/mean_terminated_length": 5817.603515625, + "completions/min_length": 294.0, + 
"completions/min_terminated_length": 294.0, + "entropy": 0.8394555225968361, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022041688207536936, + "learning_rate": 1e-05, + "loss": 0.1043, + "num_tokens": 84971129.0, + "reward": 0.3125, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030828475952, + "sampling/importance_sampling_ratio/min": 1.553593506287143e-06, + "sampling/sampling_logp_difference/max": 13.374939918518066, + "sampling/sampling_logp_difference/mean": 0.01795877143740654, + "step": 114 + }, + { + "clip_ratio/high_max": 2.9651660042873118e-05, + "clip_ratio/high_mean": 9.398806923854863e-06, + "clip_ratio/low_mean": 4.788733849636628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.728614519284747e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14988.0, + "completions/mean_length": 4976.921875, + "completions/mean_terminated_length": 4608.95166015625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.8381234556436539, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0037972736172378063, + "learning_rate": 1e-05, + "loss": 0.1244, + "num_tokens": 85625559.0, + "reward": 0.4765625, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970555305481, + "sampling/importance_sampling_ratio/min": 0.002990707289427519, + "sampling/sampling_logp_difference/max": 5.8122453689575195, + "sampling/sampling_logp_difference/mean": 0.01815030723810196, + "step": 115 + }, + { + "clip_ratio/high_max": 
4.130592969886493e-06, + "clip_ratio/high_mean": 1.0326482424716232e-06, + "clip_ratio/low_mean": 1.6904315600640984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7936963843112608e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 6307.2421875, + "completions/mean_terminated_length": 6065.400390625, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 1.1176434755325317, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0012413962977007031, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 86453606.0, + "reward": 0.28125, + "reward_std": 0.2280253767967224, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 0.004730688873678446, + "sampling/sampling_logp_difference/max": 5.353684425354004, + "sampling/sampling_logp_difference/mean": 0.021790307015180588, + "step": 116 + }, + { + "clip_ratio/high_max": 1.3160772823539446e-05, + "clip_ratio/high_mean": 3.2901932058848615e-06, + "clip_ratio/low_mean": 3.582628983167524e-05, + "clip_ratio/low_min": 2.61966624748311e-06, + "clip_ratio/region_mean": 3.911648195753514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 7263.1640625, + "completions/mean_terminated_length": 7044.26416015625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.107876107096672, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017762042116373777, + "learning_rate": 1e-05, + "loss": 0.0349, + "num_tokens": 87402763.0, + "reward": 0.2578125, + "reward_std": 
0.27776598930358887, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741315841675, + "sampling/importance_sampling_ratio/min": 0.0009408573969267309, + "sampling/sampling_logp_difference/max": 6.968719005584717, + "sampling/sampling_logp_difference/mean": 0.02103034406900406, + "step": 117 + }, + { + "clip_ratio/high_max": 3.987745776612428e-05, + "clip_ratio/high_mean": 1.1877163728968299e-05, + "clip_ratio/low_mean": 4.26799579145154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.455712096136267e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15416.0, + "completions/mean_length": 5093.859375, + "completions/mean_terminated_length": 4914.65087890625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 1.1065888702869415, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032127038575708866, + "learning_rate": 1e-05, + "loss": 0.0194, + "num_tokens": 88077385.0, + "reward": 0.421875, + "reward_std": 0.345874547958374, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 7.033879228401929e-05, + "sampling/sampling_logp_difference/max": 9.562187194824219, + "sampling/sampling_logp_difference/mean": 0.020314980298280716, + "step": 118 + }, + { + "clip_ratio/high_max": 9.35208754526684e-06, + "clip_ratio/high_mean": 4.4788730519940145e-06, + "clip_ratio/low_mean": 3.470697703278347e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.918584917528278e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15740.0, + "completions/mean_length": 6943.53125, + "completions/mean_terminated_length": 6639.0, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.9009081721305847, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028925195802003145, + "learning_rate": 1e-05, + "loss": 0.0862, + "num_tokens": 88985269.0, + "reward": 0.3984375, + "reward_std": 0.3535328209400177, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 6.553035092338177e-08, + "sampling/sampling_logp_difference/max": 16.540752410888672, + "sampling/sampling_logp_difference/mean": 0.019378282129764557, + "step": 119 + }, + { + "clip_ratio/high_max": 1.0939961612166371e-05, + "clip_ratio/high_mean": 2.734990403041593e-06, + "clip_ratio/low_mean": 2.4615862798782473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7350853201824066e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15148.0, + "completions/max_terminated_length": 15148.0, + "completions/mean_length": 4976.25, + "completions/mean_terminated_length": 4976.25, + "completions/min_length": 702.0, + "completions/min_terminated_length": 702.0, + "entropy": 0.9463540017604828, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017386430408805609, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 89645205.0, + "reward": 0.359375, + "reward_std": 0.26462042331695557, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999554753303528, + "sampling/importance_sampling_ratio/min": 7.889595508459024e-06, + 
"sampling/sampling_logp_difference/max": 11.74996566772461, + "sampling/sampling_logp_difference/mean": 0.018035830929875374, + "step": 120 + }, + { + "clip_ratio/high_max": 5.941629297012696e-06, + "clip_ratio/high_mean": 1.485407324253174e-06, + "clip_ratio/low_mean": 2.6826061798601586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8311469009167922e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 6439.5390625, + "completions/mean_terminated_length": 6281.69091796875, + "completions/min_length": 959.0, + "completions/min_terminated_length": 959.0, + "entropy": 0.899876207113266, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037381781730800867, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 90489394.0, + "reward": 0.3203125, + "reward_std": 0.2624938488006592, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999206066131592, + "sampling/importance_sampling_ratio/min": 0.003606764366850257, + "sampling/sampling_logp_difference/max": 5.62494421005249, + "sampling/sampling_logp_difference/mean": 0.019368179142475128, + "step": 121 + }, + { + "clip_ratio/high_max": 5.189952389628161e-06, + "clip_ratio/high_mean": 1.2974880974070402e-06, + "clip_ratio/low_mean": 3.058137212974543e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.187886022715247e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15979.0, + "completions/mean_length": 6876.46875, + "completions/mean_terminated_length": 6408.884765625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.1018569767475128, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0018562980694696307, + "learning_rate": 1e-05, + "loss": 0.095, + "num_tokens": 91390054.0, + "reward": 0.21875, + "reward_std": 0.29955869913101196, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999849796295166, + "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05, + "sampling/sampling_logp_difference/max": 10.436432838439941, + "sampling/sampling_logp_difference/mean": 0.020825792104005814, + "step": 122 + }, + { + "clip_ratio/high_max": 2.022083435804234e-05, + "clip_ratio/high_mean": 5.055208589510585e-06, + "clip_ratio/low_mean": 3.029032552603894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.53455343429232e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14153.0, + "completions/mean_length": 6501.5078125, + "completions/mean_terminated_length": 6344.64306640625, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "entropy": 1.073579266667366, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016695430967956781, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 92241535.0, + "reward": 0.2734375, + "reward_std": 0.28641316294670105, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998984336853027, + "sampling/importance_sampling_ratio/min": 0.0002380236255703494, + "sampling/sampling_logp_difference/max": 8.343140602111816, + "sampling/sampling_logp_difference/mean": 0.020438479259610176, + "step": 123 + }, + { + "clip_ratio/high_max": 3.3911180707946187e-06, + "clip_ratio/high_mean": 8.477795176986547e-07, + "clip_ratio/low_mean": 2.2190370486896427e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.30381500614385e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14345.0, + "completions/max_terminated_length": 14345.0, + "completions/mean_length": 5474.1328125, + "completions/mean_terminated_length": 5474.1328125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0692576617002487, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034909825772047043, + "learning_rate": 1e-05, + "loss": 0.0, + "num_tokens": 92962472.0, + "reward": 0.3046875, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000006079673767, + "sampling/importance_sampling_ratio/min": 0.0017851731972768903, + "sampling/sampling_logp_difference/max": 6.328239917755127, + "sampling/sampling_logp_difference/mean": 0.019930578768253326, + "step": 124 + }, + { + "clip_ratio/high_max": 2.6292200345778838e-05, + "clip_ratio/high_mean": 7.620442374900449e-06, + "clip_ratio/low_mean": 4.615546390596137e-05, + "clip_ratio/low_min": 1.366510537081922e-05, + "clip_ratio/region_mean": 5.3775906508235494e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + "completions/mean_length": 7512.078125, + "completions/mean_terminated_length": 7225.88671875, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9676955863833427, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023449272848665714, + "learning_rate": 1e-05, + "loss": 0.0454, + "num_tokens": 93950506.0, + "reward": 0.3203125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999359250068665, + "sampling/importance_sampling_ratio/min": 0.0016406332142651081, + "sampling/sampling_logp_difference/max": 6.412672996520996, + "sampling/sampling_logp_difference/mean": 0.020141655579209328, + "step": 125 + }, + { + "clip_ratio/high_max": 5.097255780128762e-06, + "clip_ratio/high_mean": 1.2743139450321905e-06, + "clip_ratio/low_mean": 3.3802551342887455e-05, + "clip_ratio/low_min": 4.146762421441963e-06, + "clip_ratio/region_mean": 3.5076865287919645e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6920.484375, + "completions/mean_terminated_length": 6693.3603515625, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "entropy": 0.8662540689110756, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037103090435266495, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 94854016.0, + "reward": 0.4375, + "reward_std": 0.322716623544693, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00047686786274425685, + "sampling/sampling_logp_difference/max": 7.648271083831787, + "sampling/sampling_logp_difference/mean": 0.01915796287357807, + "step": 126 + }, + { + "clip_ratio/high_max": 8.4922439782531e-06, + "clip_ratio/high_mean": 2.123060994563275e-06, + "clip_ratio/low_mean": 5.024227584726759e-05, + "clip_ratio/low_min": 1.3627016414829995e-05, + "clip_ratio/region_mean": 5.236533706920454e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 7939.609375, + 
"completions/mean_terminated_length": 7805.57177734375, + "completions/min_length": 1260.0, + "completions/min_terminated_length": 1260.0, + "entropy": 0.9707008600234985, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024642283096909523, + "learning_rate": 1e-05, + "loss": 0.0788, + "num_tokens": 95889966.0, + "reward": 0.2265625, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998771548271179, + "sampling/importance_sampling_ratio/min": 4.540014560916461e-05, + "sampling/sampling_logp_difference/max": 9.999995231628418, + "sampling/sampling_logp_difference/mean": 0.020453302189707756, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.766829564710861e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.766829564710861e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14969.0, + "completions/mean_length": 5985.8203125, + "completions/mean_terminated_length": 5474.43408203125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.9083090648055077, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003317479742690921, + "learning_rate": 1e-05, + "loss": 0.0537, + "num_tokens": 96676847.0, + "reward": 0.3671875, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.000286750087980181, + "sampling/sampling_logp_difference/max": 8.156899452209473, + "sampling/sampling_logp_difference/mean": 
0.01996719278395176, + "step": 128 + }, + { + "clip_ratio/high_max": 1.8439853647578275e-05, + "clip_ratio/high_mean": 4.609963411894569e-06, + "clip_ratio/low_mean": 5.708034223061986e-05, + "clip_ratio/low_min": 2.75287948170444e-06, + "clip_ratio/region_mean": 6.169030598357494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15081.0, + "completions/mean_length": 6565.359375, + "completions/mean_terminated_length": 6488.04736328125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 1.1013468354940414, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019073591101914644, + "learning_rate": 1e-05, + "loss": 0.0622, + "num_tokens": 97539453.0, + "reward": 0.2734375, + "reward_std": 0.307217001914978, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999555945396423, + "sampling/importance_sampling_ratio/min": 0.0006022047018632293, + "sampling/sampling_logp_difference/max": 7.414913177490234, + "sampling/sampling_logp_difference/mean": 0.02150837704539299, + "step": 129 + }, + { + "clip_ratio/high_max": 9.068485269381199e-06, + "clip_ratio/high_mean": 2.2671213173452998e-06, + "clip_ratio/low_mean": 1.9822365402433206e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.208948649240483e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16099.0, + "completions/mean_length": 6779.6171875, + "completions/mean_terminated_length": 6703.9921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8940552547574043, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0010163087863475084, + "learning_rate": 1e-05, + "loss": 0.0249, + 
"num_tokens": 98429036.0, + "reward": 0.453125, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485015869141, + "sampling/importance_sampling_ratio/min": 3.464699460664633e-08, + "sampling/sampling_logp_difference/max": 17.178054809570312, + "sampling/sampling_logp_difference/mean": 0.018716152757406235, + "step": 130 + }, + { + "clip_ratio/high_max": 5.047242211730918e-06, + "clip_ratio/high_mean": 1.2618105529327295e-06, + "clip_ratio/low_mean": 2.9014110396019532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0275920835265424e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14549.0, + "completions/max_terminated_length": 14549.0, + "completions/mean_length": 5766.71875, + "completions/mean_terminated_length": 5766.71875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 1.0455922111868858, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002155766822397709, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 99184264.0, + "reward": 0.4140625, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999253749847412, + "sampling/importance_sampling_ratio/min": 0.00010798005678225309, + "sampling/sampling_logp_difference/max": 9.133563995361328, + "sampling/sampling_logp_difference/mean": 0.020948775112628937, + "step": 131 + }, + { + "clip_ratio/high_max": 2.0882574972347356e-05, + "clip_ratio/high_mean": 6.505383225885453e-06, + "clip_ratio/low_mean": 4.496008500609605e-05, + "clip_ratio/low_min": 7.757854064038838e-06, + "clip_ratio/region_mean": 5.1465468231981504e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14704.0, + "completions/mean_length": 6167.2421875, + "completions/mean_terminated_length": 6005.07177734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.9100174158811569, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0021464223973453045, + "learning_rate": 1e-05, + "loss": -0.0279, + "num_tokens": 99996831.0, + "reward": 0.421875, + "reward_std": 0.3916535973548889, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240040779114, + "sampling/importance_sampling_ratio/min": 0.02249590866267681, + "sampling/sampling_logp_difference/max": 3.794421911239624, + "sampling/sampling_logp_difference/mean": 0.01866895705461502, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0998018473837874e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0998018473837874e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15738.0, + "completions/mean_length": 6242.9453125, + "completions/mean_terminated_length": 6163.09423828125, + "completions/min_length": 1187.0, + "completions/min_terminated_length": 1187.0, + "entropy": 0.8624134212732315, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023277695290744305, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 100814112.0, + "reward": 0.3984375, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999959409236908, + 
"sampling/importance_sampling_ratio/min": 0.0002393616596236825, + "sampling/sampling_logp_difference/max": 8.33753490447998, + "sampling/sampling_logp_difference/mean": 0.0191188994795084, + "step": 133 + }, + { + "clip_ratio/high_max": 6.589872555196052e-06, + "clip_ratio/high_mean": 1.647468138799013e-06, + "clip_ratio/low_mean": 4.329304238126497e-05, + "clip_ratio/low_min": 3.5120251595799346e-06, + "clip_ratio/region_mean": 4.494051017900347e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14866.0, + "completions/mean_length": 5733.6875, + "completions/mean_terminated_length": 5478.080078125, + "completions/min_length": 789.0, + "completions/min_terminated_length": 789.0, + "entropy": 0.9628067463636398, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003547821193933487, + "learning_rate": 1e-05, + "loss": 0.0321, + "num_tokens": 101566264.0, + "reward": 0.3984375, + "reward_std": 0.36584997177124023, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0001282035664189607, + "sampling/sampling_logp_difference/max": 8.961891174316406, + "sampling/sampling_logp_difference/mean": 0.019646761938929558, + "step": 134 + }, + { + "clip_ratio/high_max": 1.7107527582993498e-05, + "clip_ratio/high_mean": 4.2768818957483745e-06, + "clip_ratio/low_mean": 3.014796902789385e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.442485103732906e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15848.0, + "completions/max_terminated_length": 15848.0, + "completions/mean_length": 5505.9375, + "completions/mean_terminated_length": 5505.9375, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "entropy": 
0.8041045889258385, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024891747161746025, + "learning_rate": 1e-05, + "loss": 0.1406, + "num_tokens": 102291456.0, + "reward": 0.5, + "reward_std": 0.35482609272003174, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999248385429382, + "sampling/importance_sampling_ratio/min": 0.0014627616619691253, + "sampling/sampling_logp_difference/max": 6.527429103851318, + "sampling/sampling_logp_difference/mean": 0.01716250739991665, + "step": 135 + }, + { + "clip_ratio/high_max": 1.548903105685895e-05, + "clip_ratio/high_mean": 3.872257764214737e-06, + "clip_ratio/low_mean": 5.380711581892683e-05, + "clip_ratio/low_min": 4.5777483137499075e-06, + "clip_ratio/region_mean": 5.767937363998499e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16005.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 5003.0625, + "completions/mean_terminated_length": 5003.0625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 0.9115714654326439, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00220683915540576, + "learning_rate": 1e-05, + "loss": 0.1361, + "num_tokens": 102949824.0, + "reward": 0.4140625, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 8.323705696966499e-05, + "sampling/sampling_logp_difference/max": 9.393817901611328, + "sampling/sampling_logp_difference/mean": 0.018076512962579727, + "step": 136 + }, + { + "clip_ratio/high_max": 2.181136096623959e-05, + "clip_ratio/high_mean": 
5.4528402415598975e-06, + "clip_ratio/low_mean": 3.4416837252138066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986967681157694e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15658.0, + "completions/max_terminated_length": 15658.0, + "completions/mean_length": 4742.1328125, + "completions/mean_terminated_length": 4742.1328125, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.9430246204137802, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003964806906878948, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 103580913.0, + "reward": 0.4609375, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 7.031940185697749e-05, + "sampling/sampling_logp_difference/max": 9.56246280670166, + "sampling/sampling_logp_difference/mean": 0.019651200622320175, + "step": 137 + }, + { + "clip_ratio/high_max": 4.07684046876966e-06, + "clip_ratio/high_mean": 1.019210117192415e-06, + "clip_ratio/low_mean": 3.8682398553646635e-05, + "clip_ratio/low_min": 8.189203072106466e-06, + "clip_ratio/region_mean": 3.970160832977854e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 6574.171875, + "completions/mean_terminated_length": 6091.72119140625, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.8429529070854187, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002067410387098789, + "learning_rate": 1e-05, + "loss": 0.0377, + "num_tokens": 104447463.0, + "reward": 0.3125, + "reward_std": 0.24511480331420898, + "rewards/accuracy_reward/mean": 0.3125, 
+ "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997583627700806, + "sampling/importance_sampling_ratio/min": 0.00021258489869069308, + "sampling/sampling_logp_difference/max": 8.456169128417969, + "sampling/sampling_logp_difference/mean": 0.018853647634387016, + "step": 138 + }, + { + "clip_ratio/high_max": 1.9725823221961036e-05, + "clip_ratio/high_mean": 4.931455805490259e-06, + "clip_ratio/low_mean": 5.9263072444082354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.419452870431996e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15518.0, + "completions/max_terminated_length": 15518.0, + "completions/mean_length": 4581.5625, + "completions/mean_terminated_length": 4581.5625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.7094272822141647, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004292502999305725, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 105052287.0, + "reward": 0.625, + "reward_std": 0.3908300995826721, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.0019342642044648528, + "sampling/sampling_logp_difference/max": 6.24802827835083, + "sampling/sampling_logp_difference/mean": 0.016310662031173706, + "step": 139 + }, + { + "clip_ratio/high_max": 1.0132298029930098e-05, + "clip_ratio/high_mean": 2.5330745074825245e-06, + "clip_ratio/low_mean": 4.6397121650443296e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.893019581686531e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16097.0, + "completions/mean_length": 7066.4453125, + 
"completions/mean_terminated_length": 6918.5478515625, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8481669947504997, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015785128343850374, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 105977048.0, + "reward": 0.3515625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.00104097044095397, + "sampling/sampling_logp_difference/max": 6.8676018714904785, + "sampling/sampling_logp_difference/mean": 0.018304405733942986, + "step": 140 + }, + { + "clip_ratio/high_max": 1.6989023606583942e-05, + "clip_ratio/high_mean": 4.2472559016459854e-06, + "clip_ratio/low_mean": 2.3075059743860038e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7322315418132348e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16104.0, + "completions/max_terminated_length": 16104.0, + "completions/mean_length": 6230.5234375, + "completions/mean_terminated_length": 6230.5234375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.9658062160015106, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002542720176279545, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 106793187.0, + "reward": 0.3203125, + "reward_std": 0.3050953149795532, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000169277191162, + "sampling/importance_sampling_ratio/min": 0.0002781494113150984, + "sampling/sampling_logp_difference/max": 8.187352180480957, + 
"sampling/sampling_logp_difference/mean": 0.019391046836972237, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7597974508353218e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7597974508353218e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14216.0, + "completions/mean_length": 5690.5546875, + "completions/mean_terminated_length": 5606.3544921875, + "completions/min_length": 1124.0, + "completions/min_terminated_length": 1124.0, + "entropy": 1.0098655670881271, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001451602904126048, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 107539874.0, + "reward": 0.4296875, + "reward_std": 0.23304283618927002, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999307990074158, + "sampling/importance_sampling_ratio/min": 5.640022671116185e-09, + "sampling/sampling_logp_difference/max": 18.993377685546875, + "sampling/sampling_logp_difference/mean": 0.018607191741466522, + "step": 142 + }, + { + "clip_ratio/high_max": 1.2800467629858758e-05, + "clip_ratio/high_mean": 4.19954119479371e-06, + "clip_ratio/low_mean": 2.350350996493944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.770305115973315e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15791.0, + "completions/max_terminated_length": 15791.0, + "completions/mean_length": 5471.1328125, + "completions/mean_terminated_length": 5471.1328125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0413162112236023, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023549250327050686, + "learning_rate": 1e-05, + "loss": 0.0411, + 
"num_tokens": 108260091.0, + "reward": 0.3203125, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999832510948181, + "sampling/importance_sampling_ratio/min": 0.0011709182290360332, + "sampling/sampling_logp_difference/max": 6.749967098236084, + "sampling/sampling_logp_difference/mean": 0.020427243784070015, + "step": 143 + }, + { + "clip_ratio/high_max": 2.1983064925734652e-05, + "clip_ratio/high_mean": 5.495766231433663e-06, + "clip_ratio/low_mean": 4.361141452591255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9107180757346214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 6211.7421875, + "completions/mean_terminated_length": 6050.2783203125, + "completions/min_length": 622.0, + "completions/min_terminated_length": 622.0, + "entropy": 0.9706784337759018, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017527056625112891, + "learning_rate": 1e-05, + "loss": 0.0686, + "num_tokens": 109073890.0, + "reward": 0.421875, + "reward_std": 0.29826050996780396, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999092221260071, + "sampling/importance_sampling_ratio/min": 0.002898645820096135, + "sampling/sampling_logp_difference/max": 5.843511581420898, + "sampling/sampling_logp_difference/mean": 0.018898162990808487, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.208964992358233e-05, + "clip_ratio/low_min": 3.9168990042526275e-06, + "clip_ratio/region_mean": 4.208964992358233e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 14880.0, + "completions/mean_length": 6007.8984375, + "completions/mean_terminated_length": 5926.19677734375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.1967609524726868, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0007858420140109956, + "learning_rate": 1e-05, + "loss": 0.011, + "num_tokens": 109861813.0, + "reward": 0.296875, + "reward_std": 0.23486506938934326, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 3.294382011631569e-08, + "sampling/sampling_logp_difference/max": 17.22846221923828, + "sampling/sampling_logp_difference/mean": 0.021845955401659012, + "step": 145 + }, + { + "clip_ratio/high_max": 4.5118208618077915e-06, + "clip_ratio/high_mean": 1.1279552154519479e-06, + "clip_ratio/low_mean": 3.749712686840212e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8625082197540905e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15838.0, + "completions/mean_length": 6800.9921875, + "completions/mean_terminated_length": 6725.53564453125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.0437887012958527, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029428249690681696, + "learning_rate": 1e-05, + "loss": 0.0405, + "num_tokens": 110756572.0, + "reward": 0.265625, + "reward_std": 0.3248382806777954, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999890327453613, + 
"sampling/importance_sampling_ratio/min": 0.0006329434108920395, + "sampling/sampling_logp_difference/max": 7.365129470825195, + "sampling/sampling_logp_difference/mean": 0.02010120078921318, + "step": 146 + }, + { + "clip_ratio/high_max": 1.427700522071973e-05, + "clip_ratio/high_mean": 3.5692513051799324e-06, + "clip_ratio/low_mean": 4.964020990883e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.320946092979284e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6309.4453125, + "completions/mean_terminated_length": 6230.1181640625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.9768906533718109, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002088683657348156, + "learning_rate": 1e-05, + "loss": 0.0316, + "num_tokens": 111585493.0, + "reward": 0.375, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000007152557373, + "sampling/importance_sampling_ratio/min": 0.009723234921693802, + "sampling/sampling_logp_difference/max": 4.633236885070801, + "sampling/sampling_logp_difference/mean": 0.020927833393216133, + "step": 147 + }, + { + "clip_ratio/high_max": 5.4841398196003865e-06, + "clip_ratio/high_mean": 1.3710349549000966e-06, + "clip_ratio/low_mean": 5.122006064084417e-05, + "clip_ratio/low_min": 3.785125954891555e-06, + "clip_ratio/region_mean": 5.25910957094311e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15209.0, + "completions/mean_length": 6221.859375, + "completions/mean_terminated_length": 6060.5556640625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 
0.9212924689054489, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002406956860795617, + "learning_rate": 1e-05, + "loss": 0.1051, + "num_tokens": 112400363.0, + "reward": 0.40625, + "reward_std": 0.31929677724838257, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 5.8308287407271564e-05, + "sampling/sampling_logp_difference/max": 9.74976634979248, + "sampling/sampling_logp_difference/mean": 0.018652018159627914, + "step": 148 + }, + { + "clip_ratio/high_max": 1.4568151755156578e-05, + "clip_ratio/high_mean": 3.6420379387891444e-06, + "clip_ratio/low_mean": 3.999794398623635e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3639981413434725e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14997.0, + "completions/mean_length": 6942.8203125, + "completions/mean_terminated_length": 6716.232421875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.949538916349411, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022962254006415606, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 113308748.0, + "reward": 0.375, + "reward_std": 0.3329663872718811, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999334812164307, + "sampling/importance_sampling_ratio/min": 0.00048810525913722813, + "sampling/sampling_logp_difference/max": 7.624979496002197, + "sampling/sampling_logp_difference/mean": 0.01939917355775833, + "step": 149 + }, + { + "clip_ratio/high_max": 8.786732450971613e-06, + "clip_ratio/high_mean": 
2.196683112742903e-06, + "clip_ratio/low_mean": 5.562954720517155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7826231113722315e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15182.0, + "completions/mean_length": 6783.1796875, + "completions/mean_terminated_length": 6552.76025390625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.9774708449840546, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020560629200190306, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 114196235.0, + "reward": 0.34375, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998990297317505, + "sampling/importance_sampling_ratio/min": 2.4757892447269114e-07, + "sampling/sampling_logp_difference/max": 15.211536407470703, + "sampling/sampling_logp_difference/mean": 0.019691556692123413, + "step": 150 + }, + { + "clip_ratio/high_max": 1.799483243303257e-05, + "clip_ratio/high_mean": 4.498708108258143e-06, + "clip_ratio/low_mean": 2.6389980291696702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0888688343111426e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15549.0, + "completions/mean_length": 5568.15625, + "completions/mean_terminated_length": 5396.4765625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "entropy": 0.9303529411554337, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022214846685528755, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 114928047.0, + "reward": 0.234375, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.234375, + 
"rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999408721923828, + "sampling/importance_sampling_ratio/min": 2.1446083337650634e-05, + "sampling/sampling_logp_difference/max": 10.749968528747559, + "sampling/sampling_logp_difference/mean": 0.01938418298959732, + "step": 151 + }, + { + "clip_ratio/high_max": 1.1957493370573502e-05, + "clip_ratio/high_mean": 2.9893733426433755e-06, + "clip_ratio/low_mean": 5.885063319510664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.184000585562899e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15340.0, + "completions/max_terminated_length": 15340.0, + "completions/mean_length": 6086.578125, + "completions/mean_terminated_length": 6086.578125, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 0.9131873697042465, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002448044717311859, + "learning_rate": 1e-05, + "loss": 0.0599, + "num_tokens": 115725657.0, + "reward": 0.40625, + "reward_std": 0.35878273844718933, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999779462814331, + "sampling/importance_sampling_ratio/min": 0.02929726243019104, + "sampling/sampling_logp_difference/max": 3.530261278152466, + "sampling/sampling_logp_difference/mean": 0.019298439845442772, + "step": 152 + }, + { + "clip_ratio/high_max": 1.3385357760853367e-05, + "clip_ratio/high_mean": 3.3463394402133417e-06, + "clip_ratio/low_mean": 5.717015119444113e-05, + "clip_ratio/low_min": 3.4328400033700746e-06, + "clip_ratio/region_mean": 6.0516490520967636e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + 
"completions/mean_length": 6442.5390625, + "completions/mean_terminated_length": 6203.9443359375, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.8959419652819633, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002013204852119088, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 116571478.0, + "reward": 0.2734375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000044584274292, + "sampling/importance_sampling_ratio/min": 1.0374163821325055e-06, + "sampling/sampling_logp_difference/max": 13.778777122497559, + "sampling/sampling_logp_difference/mean": 0.01925014518201351, + "step": 153 + }, + { + "clip_ratio/high_max": 9.34224021875707e-06, + "clip_ratio/high_mean": 3.136903728773177e-06, + "clip_ratio/low_mean": 2.9738095065567904e-05, + "clip_ratio/low_min": 3.7240065466903616e-06, + "clip_ratio/region_mean": 3.2874999135401595e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15946.0, + "completions/mean_length": 6633.5703125, + "completions/mean_terminated_length": 6319.0400390625, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.0223619118332863, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024523327592760324, + "learning_rate": 1e-05, + "loss": 0.056, + "num_tokens": 117440743.0, + "reward": 0.3203125, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 3.0026931199245155e-05, + 
"sampling/sampling_logp_difference/max": 10.413415908813477, + "sampling/sampling_logp_difference/mean": 0.02061290666460991, + "step": 154 + }, + { + "clip_ratio/high_max": 1.4537483366439119e-05, + "clip_ratio/high_mean": 3.6343708416097797e-06, + "clip_ratio/low_mean": 3.954866042477079e-05, + "clip_ratio/low_min": 9.874949228105834e-06, + "clip_ratio/region_mean": 4.318303126638057e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15919.0, + "completions/mean_length": 7183.0, + "completions/mean_terminated_length": 6886.193359375, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.9815369099378586, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018688985146582127, + "learning_rate": 1e-05, + "loss": 0.0395, + "num_tokens": 118380687.0, + "reward": 0.2890625, + "reward_std": 0.2498900145292282, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999039173126221, + "sampling/importance_sampling_ratio/min": 1.3847662557964213e-05, + "sampling/sampling_logp_difference/max": 11.187394142150879, + "sampling/sampling_logp_difference/mean": 0.019792160019278526, + "step": 155 + }, + { + "clip_ratio/high_max": 7.165636361605721e-06, + "clip_ratio/high_mean": 1.7914090904014301e-06, + "clip_ratio/low_mean": 4.9011068711024564e-05, + "clip_ratio/low_min": 1.0991705721608014e-05, + "clip_ratio/region_mean": 5.0802477687739156e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 6324.640625, + "completions/mean_terminated_length": 5829.91748046875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.852975606918335, + "epoch": 
0.14351425942962281, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002005894435569644, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 119207089.0, + "reward": 0.3984375, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000035762786865, + "sampling/importance_sampling_ratio/min": 5.788659223071591e-07, + "sampling/sampling_logp_difference/max": 14.362195014953613, + "sampling/sampling_logp_difference/mean": 0.01853565312922001, + "step": 156 + }, + { + "clip_ratio/high_max": 7.795394822096569e-06, + "clip_ratio/high_mean": 1.948848705524142e-06, + "clip_ratio/low_mean": 3.834237736555224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0291225786859286e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 5723.421875, + "completions/mean_terminated_length": 5290.06494140625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.8744911625981331, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002577397273853421, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 119961895.0, + "reward": 0.390625, + "reward_std": 0.34321609139442444, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999703764915466, + "sampling/importance_sampling_ratio/min": 0.07882421463727951, + "sampling/sampling_logp_difference/max": 2.5405349731445312, + "sampling/sampling_logp_difference/mean": 0.018341556191444397, + "step": 157 + }, + { + "clip_ratio/high_max": 9.214097190124448e-06, + "clip_ratio/high_mean": 2.303524297531112e-06, + 
"clip_ratio/low_mean": 2.636873176697918e-05, + "clip_ratio/low_min": 2.9339967113628518e-06, + "clip_ratio/region_mean": 2.8672255837136618e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16055.0, + "completions/mean_length": 7886.015625, + "completions/mean_terminated_length": 7682.064453125, + "completions/min_length": 989.0, + "completions/min_terminated_length": 989.0, + "entropy": 0.9391767829656601, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002552987542003393, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 120990289.0, + "reward": 0.328125, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 0.000899312668479979, + "sampling/sampling_logp_difference/max": 7.013879776000977, + "sampling/sampling_logp_difference/mean": 0.02049873024225235, + "step": 158 + }, + { + "clip_ratio/high_max": 3.406416203688423e-05, + "clip_ratio/high_mean": 9.72330332160709e-06, + "clip_ratio/low_mean": 3.168332909808669e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140663151019908e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 6173.1640625, + "completions/mean_terminated_length": 6011.087890625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.9148785546422005, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002678362652659416, + "learning_rate": 1e-05, + "loss": 0.039, + "num_tokens": 121797958.0, + "reward": 0.4140625, + "reward_std": 0.3608373999595642, + "rewards/accuracy_reward/mean": 0.4140625, + 
"rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999265074729919, + "sampling/importance_sampling_ratio/min": 0.002013920107856393, + "sampling/sampling_logp_difference/max": 6.207672119140625, + "sampling/sampling_logp_difference/mean": 0.018977735191583633, + "step": 159 + }, + { + "clip_ratio/high_max": 1.8476588593330234e-05, + "clip_ratio/high_mean": 4.6191471483325586e-06, + "clip_ratio/low_mean": 4.459614581264759e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9215293188353826e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 6594.21875, + "completions/mean_terminated_length": 6196.259765625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.9486038386821747, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033711253199726343, + "learning_rate": 1e-05, + "loss": 0.026, + "num_tokens": 122661170.0, + "reward": 0.3828125, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998981356620789, + "sampling/importance_sampling_ratio/min": 0.0002968419576063752, + "sampling/sampling_logp_difference/max": 8.122310638427734, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 160 + }, + { + "clip_ratio/high_max": 7.97335997049231e-06, + "clip_ratio/high_mean": 2.7343705824023345e-06, + "clip_ratio/low_mean": 5.420079878604156e-05, + "clip_ratio/low_min": 4.594068286678521e-06, + "clip_ratio/region_mean": 5.693517005056492e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15928.0, + 
"completions/mean_length": 6533.9453125, + "completions/mean_terminated_length": 6377.595703125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9986584335565567, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017857529455795884, + "learning_rate": 1e-05, + "loss": 0.0804, + "num_tokens": 123518107.0, + "reward": 0.34375, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998549818992615, + "sampling/importance_sampling_ratio/min": 9.012701411847956e-06, + "sampling/sampling_logp_difference/max": 11.616875648498535, + "sampling/sampling_logp_difference/mean": 0.02010391652584076, + "step": 161 + }, + { + "clip_ratio/high_max": 4.470512521947967e-06, + "clip_ratio/high_mean": 1.1176281304869917e-06, + "clip_ratio/low_mean": 3.5141094485879876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.625872295742738e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13212.0, + "completions/mean_length": 5742.21875, + "completions/mean_terminated_length": 5658.42529296875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0379670709371567, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018227624241262674, + "learning_rate": 1e-05, + "loss": -0.0237, + "num_tokens": 124279031.0, + "reward": 0.21875, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998506903648376, + "sampling/importance_sampling_ratio/min": 0.0020977305248379707, + "sampling/sampling_logp_difference/max": 
6.16689920425415, + "sampling/sampling_logp_difference/mean": 0.019987668842077255, + "step": 162 + }, + { + "clip_ratio/high_max": 1.0003542683989508e-05, + "clip_ratio/high_mean": 3.21091931709816e-06, + "clip_ratio/low_mean": 5.731009014198207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.0521009800140746e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7584.703125, + "completions/mean_terminated_length": 7515.41748046875, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "entropy": 0.953459307551384, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002219022251665592, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 125270761.0, + "reward": 0.359375, + "reward_std": 0.37033066153526306, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999880790710449, + "sampling/importance_sampling_ratio/min": 0.0024849213659763336, + "sampling/sampling_logp_difference/max": 5.997514247894287, + "sampling/sampling_logp_difference/mean": 0.020291510969400406, + "step": 163 + }, + { + "clip_ratio/high_max": 7.734669452474918e-06, + "clip_ratio/high_mean": 1.9336673631187296e-06, + "clip_ratio/low_mean": 3.1135301298945706e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3068968605221016e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 4714.671875, + "completions/mean_terminated_length": 4622.78759765625, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 1.018719919025898, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.0014189074281603098, + "learning_rate": 1e-05, + "loss": 0.0501, + "num_tokens": 125895279.0, + "reward": 0.3984375, + "reward_std": 0.28383445739746094, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479651451111, + "sampling/importance_sampling_ratio/min": 4.017410901724361e-07, + "sampling/sampling_logp_difference/max": 14.727458000183105, + "sampling/sampling_logp_difference/mean": 0.018739396706223488, + "step": 164 + }, + { + "clip_ratio/high_max": 1.0069575182569679e-05, + "clip_ratio/high_mean": 2.5173937956424197e-06, + "clip_ratio/low_mean": 3.824179225375701e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0759185367278405e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15913.0, + "completions/mean_length": 6316.140625, + "completions/mean_terminated_length": 6074.51220703125, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.9325072392821312, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001702460227534175, + "learning_rate": 1e-05, + "loss": 0.1007, + "num_tokens": 126722881.0, + "reward": 0.4609375, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999539852142334, + "sampling/importance_sampling_ratio/min": 0.0012551364488899708, + "sampling/sampling_logp_difference/max": 6.680510997772217, + "sampling/sampling_logp_difference/mean": 0.01929408684372902, + "step": 165 + }, + { + "clip_ratio/high_max": 6.873041002108948e-06, + "clip_ratio/high_mean": 1.718260250527237e-06, + "clip_ratio/low_mean": 3.119859468370123e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.291685527528898e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15832.0, + "completions/mean_length": 4687.140625, + "completions/mean_terminated_length": 4595.03955078125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 1.0886607319116592, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032931750174611807, + "learning_rate": 1e-05, + "loss": 0.0078, + "num_tokens": 127341715.0, + "reward": 0.28125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821186065674, + "sampling/importance_sampling_ratio/min": 0.0019364450126886368, + "sampling/sampling_logp_difference/max": 6.246901512145996, + "sampling/sampling_logp_difference/mean": 0.020621225237846375, + "step": 166 + }, + { + "clip_ratio/high_max": 1.773085250533768e-05, + "clip_ratio/high_mean": 4.43271312633442e-06, + "clip_ratio/low_mean": 4.30743207289197e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7507033741567284e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14125.0, + "completions/mean_length": 5705.515625, + "completions/mean_terminated_length": 5449.232421875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0523068830370903, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0031696646474301815, + "learning_rate": 1e-05, + "loss": -0.0414, + "num_tokens": 128093597.0, + "reward": 0.1953125, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999619126319885, + "sampling/importance_sampling_ratio/min": 3.197810656274669e-05, + "sampling/sampling_logp_difference/max": 10.350459098815918, + "sampling/sampling_logp_difference/mean": 0.021961934864521027, + "step": 167 + }, + { + "clip_ratio/high_max": 1.885905066956184e-05, + "clip_ratio/high_mean": 4.71476266739046e-06, + "clip_ratio/low_mean": 5.0530389898995054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.524515336219338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15958.0, + "completions/mean_length": 6214.4921875, + "completions/mean_terminated_length": 6053.07177734375, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.9371421113610268, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023704832419753075, + "learning_rate": 1e-05, + "loss": 0.075, + "num_tokens": 128906948.0, + "reward": 0.40625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.0003354824730195105, + "sampling/sampling_logp_difference/max": 7.999940872192383, + "sampling/sampling_logp_difference/mean": 0.01882763020694256, + "step": 168 + }, + { + "clip_ratio/high_max": 3.042072216885572e-05, + "clip_ratio/high_mean": 7.60518054221393e-06, + "clip_ratio/low_mean": 4.5897569179942366e-05, + "clip_ratio/low_min": 8.727477506909054e-06, + "clip_ratio/region_mean": 5.3502750233747065e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 7127.0703125, + "completions/mean_terminated_length": 7054.18115234375, + 
"completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.9854387491941452, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003370177699252963, + "learning_rate": 1e-05, + "loss": 0.1197, + "num_tokens": 129839813.0, + "reward": 0.359375, + "reward_std": 0.3329663574695587, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999907910823822, + "sampling/importance_sampling_ratio/min": 1.077816432371037e-05, + "sampling/sampling_logp_difference/max": 11.43798828125, + "sampling/sampling_logp_difference/mean": 0.019736800342798233, + "step": 169 + }, + { + "clip_ratio/high_max": 2.1401074718596647e-05, + "clip_ratio/high_mean": 6.243764005375851e-06, + "clip_ratio/low_mean": 3.2797592325550795e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.904135610355297e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 6566.2890625, + "completions/mean_terminated_length": 6330.6640625, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.7978609576821327, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026055986527353525, + "learning_rate": 1e-05, + "loss": 0.0661, + "num_tokens": 130698370.0, + "reward": 0.5, + "reward_std": 0.36295419931411743, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999133944511414, + "sampling/importance_sampling_ratio/min": 0.00031152591691352427, + "sampling/sampling_logp_difference/max": 8.074028015136719, + "sampling/sampling_logp_difference/mean": 0.01787097379565239, + "step": 170 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0564424403346493e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0564424403346493e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15576.0, + "completions/max_terminated_length": 15576.0, + "completions/mean_length": 7186.2890625, + "completions/mean_terminated_length": 7186.2890625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 1.0232757329940796, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0023866184055805206, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 131637439.0, + "reward": 0.2734375, + "reward_std": 0.2059282809495926, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999207258224487, + "sampling/importance_sampling_ratio/min": 0.0007378471200354397, + "sampling/sampling_logp_difference/max": 7.211773872375488, + "sampling/sampling_logp_difference/mean": 0.02137116715312004, + "step": 171 + }, + { + "clip_ratio/high_max": 4.037900725961663e-05, + "clip_ratio/high_mean": 1.0094751814904157e-05, + "clip_ratio/low_mean": 5.8380828136250784e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.847557995115494e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13638.0, + "completions/mean_length": 5591.5703125, + "completions/mean_terminated_length": 5420.26220703125, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9335208311676979, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003491115989163518, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 132371816.0, + "reward": 0.5, + "reward_std": 0.3406373858451843, + 
"rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999891459941864, + "sampling/importance_sampling_ratio/min": 0.00012356207298580557, + "sampling/sampling_logp_difference/max": 8.998766899108887, + "sampling/sampling_logp_difference/mean": 0.018760837614536285, + "step": 172 + }, + { + "clip_ratio/high_max": 2.8378776733006816e-06, + "clip_ratio/high_mean": 7.094694183251704e-07, + "clip_ratio/low_mean": 4.4085751369493664e-05, + "clip_ratio/low_min": 6.7955093072669115e-06, + "clip_ratio/region_mean": 4.4795220674132e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16302.0, + "completions/mean_length": 7152.3828125, + "completions/mean_terminated_length": 6930.82421875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.1329835206270218, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002830669516697526, + "learning_rate": 1e-05, + "loss": 0.0526, + "num_tokens": 133307297.0, + "reward": 0.28125, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999501705169678, + "sampling/importance_sampling_ratio/min": 0.00028047082014381886, + "sampling/sampling_logp_difference/max": 8.179040908813477, + "sampling/sampling_logp_difference/mean": 0.021548541262745857, + "step": 173 + }, + { + "clip_ratio/high_max": 1.0150829439226072e-05, + "clip_ratio/high_mean": 2.537707359806518e-06, + "clip_ratio/low_mean": 3.4009618616437365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.654732597624388e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15068.0, + "completions/mean_length": 7263.453125, + "completions/mean_terminated_length": 7118.68310546875, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.092760555446148, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0027821618132293224, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 134260107.0, + "reward": 0.3203125, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999946117401123, + "sampling/importance_sampling_ratio/min": 7.832317351130769e-05, + "sampling/sampling_logp_difference/max": 9.454667091369629, + "sampling/sampling_logp_difference/mean": 0.022098438814282417, + "step": 174 + }, + { + "clip_ratio/high_max": 1.0561876024439698e-05, + "clip_ratio/high_mean": 2.6404690061099245e-06, + "clip_ratio/low_mean": 1.6864279416495265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9504748649978865e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15388.0, + "completions/mean_length": 7088.8125, + "completions/mean_terminated_length": 6710.958984375, + "completions/min_length": 1314.0, + "completions/min_terminated_length": 1314.0, + "entropy": 1.0669445469975471, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0007076738984324038, + "learning_rate": 1e-05, + "loss": -0.0197, + "num_tokens": 135186139.0, + "reward": 0.328125, + "reward_std": 0.20593319833278656, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998199343681335, + "sampling/importance_sampling_ratio/min": 
3.084653872065246e-05, + "sampling/sampling_logp_difference/max": 10.386486053466797, + "sampling/sampling_logp_difference/mean": 0.020075790584087372, + "step": 175 + }, + { + "clip_ratio/high_max": 7.095016371749807e-06, + "clip_ratio/high_mean": 1.7737540929374518e-06, + "clip_ratio/low_mean": 2.7592465016823553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.936621888238733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15626.0, + "completions/max_terminated_length": 15626.0, + "completions/mean_length": 5352.734375, + "completions/mean_terminated_length": 5352.734375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 1.0387161895632744, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0022445612121373415, + "learning_rate": 1e-05, + "loss": 0.0261, + "num_tokens": 135888929.0, + "reward": 0.4765625, + "reward_std": 0.399257630109787, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054670333862, + "sampling/importance_sampling_ratio/min": 0.00032565294532105327, + "sampling/sampling_logp_difference/max": 8.029678344726562, + "sampling/sampling_logp_difference/mean": 0.02010166086256504, + "step": 176 + }, + { + "clip_ratio/high_max": 1.5100852124305675e-05, + "clip_ratio/high_mean": 4.426987970873597e-06, + "clip_ratio/low_mean": 2.7625993425317574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2052981168817496e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16266.0, + "completions/mean_length": 7758.90625, + "completions/mean_terminated_length": 7408.29248046875, + "completions/min_length": 742.0, + "completions/min_terminated_length": 742.0, + "entropy": 1.0648984238505363, + "epoch": 0.16283348666053357, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0022021254990249872, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 136901941.0, + "reward": 0.3671875, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858140945435, + "sampling/importance_sampling_ratio/min": 2.2461865967216e-07, + "sampling/sampling_logp_difference/max": 15.30886173248291, + "sampling/sampling_logp_difference/mean": 0.021426808089017868, + "step": 177 + }, + { + "clip_ratio/high_max": 2.5346608254039893e-05, + "clip_ratio/high_mean": 7.4063813144675805e-06, + "clip_ratio/low_mean": 2.2069365058996482e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9475746259777225e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 7036.953125, + "completions/mean_terminated_length": 6496.21484375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9684997871518135, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013461806811392307, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 137824623.0, + "reward": 0.34375, + "reward_std": 0.2546031177043915, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999944806098938, + "sampling/importance_sampling_ratio/min": 5.834372132085264e-05, + "sampling/sampling_logp_difference/max": 9.74915885925293, + "sampling/sampling_logp_difference/mean": 0.020304443314671516, + "step": 178 + }, + { + "clip_ratio/high_max": 1.3147734080121154e-05, + "clip_ratio/high_mean": 3.2869335200302885e-06, + "clip_ratio/low_mean": 4.841489999307669e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.170183294467279e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15500.0, + "completions/mean_length": 6114.1875, + "completions/mean_terminated_length": 5951.1748046875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.943072073161602, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132438588887453, + "learning_rate": 1e-05, + "loss": 0.0943, + "num_tokens": 138625247.0, + "reward": 0.40625, + "reward_std": 0.321650892496109, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999298453330994, + "sampling/importance_sampling_ratio/min": 0.0017275095451623201, + "sampling/sampling_logp_difference/max": 6.361074447631836, + "sampling/sampling_logp_difference/mean": 0.020084267482161522, + "step": 179 + }, + { + "clip_ratio/high_max": 1.7873157958092634e-05, + "clip_ratio/high_mean": 4.468289489523158e-06, + "clip_ratio/low_mean": 3.5252990301160025e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9721279790683184e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15050.0, + "completions/mean_length": 7618.875, + "completions/mean_terminated_length": 7034.53369140625, + "completions/min_length": 1030.0, + "completions/min_terminated_length": 1030.0, + "entropy": 0.9142575263977051, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026741649489849806, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 139619287.0, + "reward": 0.2890625, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 0.005949751473963261, + "sampling/sampling_logp_difference/max": 5.124405860900879, + "sampling/sampling_logp_difference/mean": 0.020061582326889038, + "step": 180 + }, + { + "clip_ratio/high_max": 1.0512151675357018e-05, + "clip_ratio/high_mean": 2.6280379188392544e-06, + "clip_ratio/low_mean": 4.5301517502593924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.792955542143318e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16106.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 5333.875, + "completions/mean_terminated_length": 5333.875, + "completions/min_length": 1109.0, + "completions/min_terminated_length": 1109.0, + "entropy": 0.8107482865452766, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027016003150492907, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 140318935.0, + "reward": 0.5703125, + "reward_std": 0.2556639611721039, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + "sampling/importance_sampling_ratio/min": 0.006856904830783606, + "sampling/sampling_logp_difference/max": 4.982499122619629, + "sampling/sampling_logp_difference/mean": 0.017069874331355095, + "step": 181 + }, + { + "clip_ratio/high_max": 1.85085939392593e-05, + "clip_ratio/high_mean": 5.24943533264377e-06, + "clip_ratio/low_mean": 5.6120721524166584e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.137015702734061e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16050.0, + "completions/mean_length": 7443.3046875, + "completions/mean_terminated_length": 7154.89501953125, + 
"completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 0.9224414080381393, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002655779244378209, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 141293534.0, + "reward": 0.234375, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999659061431885, + "sampling/importance_sampling_ratio/min": 0.00042018835665658116, + "sampling/sampling_logp_difference/max": 7.774807453155518, + "sampling/sampling_logp_difference/mean": 0.02006504125893116, + "step": 182 + }, + { + "clip_ratio/high_max": 1.494229445597739e-05, + "clip_ratio/high_mean": 3.7355736139943474e-06, + "clip_ratio/low_mean": 2.2748562741981004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6484136355975352e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15923.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 5646.6875, + "completions/mean_terminated_length": 5646.6875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8945339694619179, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016281780553981662, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 142037438.0, + "reward": 0.46875, + "reward_std": 0.17912296950817108, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030517578125, + "sampling/importance_sampling_ratio/min": 0.0005717006279155612, + "sampling/sampling_logp_difference/max": 7.46689510345459, + "sampling/sampling_logp_difference/mean": 0.019336247816681862, + "step": 183 + }, + { + 
"clip_ratio/high_max": 3.335990868436056e-05, + "clip_ratio/high_mean": 8.33997717109014e-06, + "clip_ratio/low_mean": 3.5050728683927446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339070608239126e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14142.0, + "completions/mean_length": 6384.640625, + "completions/mean_terminated_length": 5892.86865234375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.840093269944191, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002166559686884284, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 142873848.0, + "reward": 0.4765625, + "reward_std": 0.35506346821784973, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 4.785555574926548e-06, + "sampling/sampling_logp_difference/max": 12.249908447265625, + "sampling/sampling_logp_difference/mean": 0.018109092488884926, + "step": 184 + }, + { + "clip_ratio/high_max": 1.541105484648142e-05, + "clip_ratio/high_mean": 3.852763711620355e-06, + "clip_ratio/low_mean": 4.0552770769863855e-05, + "clip_ratio/low_min": 7.133888630050933e-06, + "clip_ratio/region_mean": 4.440553459517105e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14828.0, + "completions/mean_length": 5775.0, + "completions/mean_terminated_length": 5691.46435546875, + "completions/min_length": 1147.0, + "completions/min_terminated_length": 1147.0, + "entropy": 0.8915362879633904, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021932912059128284, + "learning_rate": 1e-05, + "loss": -0.0086, + "num_tokens": 143636152.0, + "reward": 
0.4375, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000008225440979, + "sampling/importance_sampling_ratio/min": 9.714113069492214e-09, + "sampling/sampling_logp_difference/max": 18.44968605041504, + "sampling/sampling_logp_difference/mean": 0.019278086721897125, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7509142171311396e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7509142171311396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6181.640625, + "completions/mean_terminated_length": 6019.69873046875, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "entropy": 1.0544511675834656, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022947140969336033, + "learning_rate": 1e-05, + "loss": 0.0242, + "num_tokens": 144447370.0, + "reward": 0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999147653579712, + "sampling/importance_sampling_ratio/min": 7.419757253046555e-08, + "sampling/sampling_logp_difference/max": 16.416534423828125, + "sampling/sampling_logp_difference/mean": 0.02050788700580597, + "step": 186 + }, + { + "clip_ratio/high_max": 1.5700999938417226e-05, + "clip_ratio/high_mean": 3.9252499846043065e-06, + "clip_ratio/low_mean": 2.4595847037289786e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8521096965050674e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15824.0, + "completions/mean_length": 6542.3046875, + "completions/mean_terminated_length": 6306.1044921875, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.933225467801094, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034910975955426693, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 145303505.0, + "reward": 0.390625, + "reward_std": 0.30433881282806396, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999945163726807, + "sampling/importance_sampling_ratio/min": 0.007213745731860399, + "sampling/sampling_logp_difference/max": 4.931766986846924, + "sampling/sampling_logp_difference/mean": 0.020022759214043617, + "step": 187 + }, + { + "clip_ratio/high_max": 6.0999414017715026e-06, + "clip_ratio/high_mean": 1.5249853504428756e-06, + "clip_ratio/low_mean": 2.61421698724007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7667155109156738e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 5889.4765625, + "completions/mean_terminated_length": 5637.6083984375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.9649673849344254, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024078311398625374, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 146082198.0, + "reward": 0.3359375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999341368675232, + "sampling/importance_sampling_ratio/min": 
0.0008680344326421618, + "sampling/sampling_logp_difference/max": 7.04927921295166, + "sampling/sampling_logp_difference/mean": 0.02060198038816452, + "step": 188 + }, + { + "clip_ratio/high_max": 7.789618393871933e-06, + "clip_ratio/high_mean": 1.9474045984679833e-06, + "clip_ratio/low_mean": 3.6395756637830345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.834316100892465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16233.0, + "completions/mean_length": 5349.2421875, + "completions/mean_terminated_length": 5084.408203125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8402756005525589, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021191861014813185, + "learning_rate": 1e-05, + "loss": 0.1275, + "num_tokens": 146786245.0, + "reward": 0.4765625, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999837875366211, + "sampling/importance_sampling_ratio/min": 3.763807762879878e-05, + "sampling/sampling_logp_difference/max": 10.187494277954102, + "sampling/sampling_logp_difference/mean": 0.017112664878368378, + "step": 189 + }, + { + "clip_ratio/high_max": 1.2461773394534248e-05, + "clip_ratio/high_mean": 3.115443348633562e-06, + "clip_ratio/low_mean": 5.095924211673264e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4074685294835945e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 7272.3203125, + "completions/mean_terminated_length": 7053.64013671875, + "completions/min_length": 1074.0, + "completions/min_terminated_length": 1074.0, + "entropy": 0.9627499282360077, + "epoch": 
0.17479300827966882, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022120666690170765, + "learning_rate": 1e-05, + "loss": 0.0079, + "num_tokens": 147737086.0, + "reward": 0.2890625, + "reward_std": 0.27304792404174805, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999538660049438, + "sampling/importance_sampling_ratio/min": 1.6960719221970066e-05, + "sampling/sampling_logp_difference/max": 10.984610557556152, + "sampling/sampling_logp_difference/mean": 0.0203307643532753, + "step": 190 + }, + { + "clip_ratio/high_max": 1.7891727566166082e-05, + "clip_ratio/high_mean": 4.472931891541521e-06, + "clip_ratio/low_mean": 5.616715043288423e-05, + "clip_ratio/low_min": 7.80031223257538e-06, + "clip_ratio/region_mean": 6.064008221073891e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 6387.1875, + "completions/mean_terminated_length": 5895.54052734375, + "completions/min_length": 1310.0, + "completions/min_terminated_length": 1310.0, + "entropy": 0.9110158830881119, + "epoch": 0.17571297148114076, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030851473566144705, + "learning_rate": 1e-05, + "loss": 0.1091, + "num_tokens": 148573782.0, + "reward": 0.40625, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 0.003961040172725916, + "sampling/sampling_logp_difference/max": 5.531248569488525, + "sampling/sampling_logp_difference/mean": 0.018049638718366623, + "step": 191 + }, + { + "clip_ratio/high_max": 1.6994396901282016e-05, + "clip_ratio/high_mean": 5.400205964178895e-06, + 
"clip_ratio/low_mean": 3.274822392995702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8148429439388565e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 7267.59375, + "completions/mean_terminated_length": 7195.81103515625, + "completions/min_length": 653.0, + "completions/min_terminated_length": 653.0, + "entropy": 0.9254888147115707, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020694085396826267, + "learning_rate": 1e-05, + "loss": 0.0462, + "num_tokens": 149521258.0, + "reward": 0.2734375, + "reward_std": 0.29719972610473633, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054670333862, + "sampling/importance_sampling_ratio/min": 7.411616934405174e-06, + "sampling/sampling_logp_difference/max": 11.812461853027344, + "sampling/sampling_logp_difference/mean": 0.01898832805454731, + "step": 192 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 149521258, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-192/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-192/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-192/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = 
os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = 
state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/README.md b/dapo_lora_plus_20251202_001141/checkpoint-256/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-256/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users 
(both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-256/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-256/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", 
+ "k_proj", + "gate_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-256/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-256/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false 
-%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/latest b/dapo_lora_plus_20251202_001141/checkpoint-256/latest new file mode 100644 index 0000000000000000000000000000000000000000..b747f9725067064e241a7a3bed90583971af8ad1 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-256/latest @@ -0,0 +1 @@ +global_step256 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-256/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-256/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/tokenizer_config.json 
b/dapo_lora_plus_20251202_001141/checkpoint-256/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-256/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": 
"LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-256/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ea6a125e6f21394d9d572856f65dd117d5ebc999 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-256/trainer_state.json @@ -0,0 +1,7970 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.23551057957681693, + "eval_steps": 500, + "global_step": 256, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025745572056621313, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 5.499582130141789e-06, + "clip_ratio/high_mean": 1.3748955325354473e-06, + "clip_ratio/low_mean": 
2.871888784738985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009378326623846e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 4767.1875, + "completions/mean_terminated_length": 4767.1875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.088237851858139, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002068034838885069, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 1425798.0, + "reward": 0.3046875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 0.01811397261917591, + "sampling/sampling_logp_difference/max": 4.011071681976318, + "sampling/sampling_logp_difference/mean": 0.01877593621611595, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.459846724103045e-05, + "clip_ratio/low_min": 3.4060874440910993e-06, + "clip_ratio/region_mean": 4.459846724103045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 6586.359375, + "completions/mean_terminated_length": 6351.21630859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0497623533010483, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001971944235265255, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 2287420.0, + "reward": 0.28125, + "reward_std": 0.29143062233924866, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999316334724426, + "sampling/importance_sampling_ratio/min": 5.356698966352269e-05, + "sampling/sampling_logp_difference/max": 9.834577560424805, + "sampling/sampling_logp_difference/mean": 0.02137824520468712, + "step": 3 + }, + { + "clip_ratio/high_max": 1.7640652004047297e-05, + "clip_ratio/high_mean": 5.48578327652649e-06, + "clip_ratio/low_mean": 3.218628648937738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.767206976590387e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14690.0, + "completions/max_terminated_length": 14690.0, + "completions/mean_length": 5448.0234375, + "completions/mean_terminated_length": 5448.0234375, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 1.1134418621659279, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016465173102915287, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 3009167.0, + "reward": 0.2890625, + "reward_std": 0.27958330512046814, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 7.889385415182915e-06, + "sampling/sampling_logp_difference/max": 11.749992370605469, + "sampling/sampling_logp_difference/mean": 0.020580951124429703, + "step": 4 + }, + { + "clip_ratio/high_max": 1.3439519989333348e-05, + "clip_ratio/high_mean": 3.359879997333337e-06, + "clip_ratio/low_mean": 2.8849915906903334e-05, + "clip_ratio/low_min": 8.467687621305231e-06, + "clip_ratio/region_mean": 3.220979442630778e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13420.0, + "completions/mean_length": 5436.8671875, + 
"completions/mean_terminated_length": 5350.66943359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 1.1473859176039696, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023770295083522797, + "learning_rate": 1e-05, + "loss": 0.0153, + "num_tokens": 3725654.0, + "reward": 0.2734375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991774559021, + "sampling/importance_sampling_ratio/min": 0.0011146117467433214, + "sampling/sampling_logp_difference/max": 6.799249172210693, + "sampling/sampling_logp_difference/mean": 0.020377254113554955, + "step": 5 + }, + { + "clip_ratio/high_max": 4.652201369026443e-06, + "clip_ratio/high_mean": 1.1630503422566107e-06, + "clip_ratio/low_mean": 2.8399212624208303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9562263534899103e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14440.0, + "completions/max_terminated_length": 14440.0, + "completions/mean_length": 4697.5390625, + "completions/mean_terminated_length": 4697.5390625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.0097229778766632, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003342699259519577, + "learning_rate": 1e-05, + "loss": 0.0326, + "num_tokens": 4345547.0, + "reward": 0.390625, + "reward_std": 0.34480881690979004, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914765357971, + "sampling/importance_sampling_ratio/min": 0.002385853324085474, + "sampling/sampling_logp_difference/max": 6.038198471069336, + 
"sampling/sampling_logp_difference/mean": 0.0185473021119833, + "step": 6 + }, + { + "clip_ratio/high_max": 9.362594937556423e-06, + "clip_ratio/high_mean": 2.340648734389106e-06, + "clip_ratio/low_mean": 6.054362825125281e-05, + "clip_ratio/low_min": 7.427356649714056e-06, + "clip_ratio/region_mean": 6.288427744038927e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14652.0, + "completions/mean_length": 6218.2109375, + "completions/mean_terminated_length": 5890.2822265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.0579778030514717, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002073560608550906, + "learning_rate": 1e-05, + "loss": 0.0201, + "num_tokens": 5160646.0, + "reward": 0.2109375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 0.00044544730917550623, + "sampling/sampling_logp_difference/max": 7.716431617736816, + "sampling/sampling_logp_difference/mean": 0.020321575924754143, + "step": 7 + }, + { + "clip_ratio/high_max": 1.1064067621191498e-05, + "clip_ratio/high_mean": 2.7660169052978745e-06, + "clip_ratio/low_mean": 2.2175867059104348e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4941883737028547e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13637.0, + "completions/mean_length": 5127.8359375, + "completions/mean_terminated_length": 5039.20458984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.0472618415951729, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.0032994600478559732, + "learning_rate": 1e-05, + "loss": 0.0751, + "num_tokens": 5836289.0, + "reward": 0.3359375, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483227729797, + "sampling/importance_sampling_ratio/min": 0.0013780994340777397, + "sampling/sampling_logp_difference/max": 6.587049961090088, + "sampling/sampling_logp_difference/mean": 0.01940803974866867, + "step": 8 + }, + { + "clip_ratio/high_max": 1.2357884770608507e-05, + "clip_ratio/high_mean": 3.0894711926521268e-06, + "clip_ratio/low_mean": 3.000627111759968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.309574231025181e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15916.0, + "completions/mean_length": 4516.890625, + "completions/mean_terminated_length": 4423.44873046875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.911251038312912, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003016560571268201, + "learning_rate": 1e-05, + "loss": 0.1006, + "num_tokens": 6433171.0, + "reward": 0.390625, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999179840087891, + "sampling/importance_sampling_ratio/min": 0.005480794236063957, + "sampling/sampling_logp_difference/max": 5.206505298614502, + "sampling/sampling_logp_difference/mean": 0.017437148839235306, + "step": 9 + }, + { + "clip_ratio/high_max": 4.6329013457580004e-05, + "clip_ratio/high_mean": 1.1582253364395001e-05, + "clip_ratio/low_mean": 7.069455705277505e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 8.227681109929108e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13970.0, + "completions/mean_length": 4961.453125, + "completions/mean_terminated_length": 4687.31201171875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "entropy": 0.6808596402406693, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0035386616364121437, + "learning_rate": 1e-05, + "loss": 0.0596, + "num_tokens": 7085389.0, + "reward": 0.5625, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0002734088629949838, + "sampling/sampling_logp_difference/max": 8.20454216003418, + "sampling/sampling_logp_difference/mean": 0.01566406339406967, + "step": 10 + }, + { + "clip_ratio/high_max": 2.43190661421977e-05, + "clip_ratio/high_mean": 6.079766535549425e-06, + "clip_ratio/low_mean": 2.2395396172214532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8475162707763957e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14776.0, + "completions/mean_length": 4429.40625, + "completions/mean_terminated_length": 4335.275390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9181502386927605, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022535293828696012, + "learning_rate": 1e-05, + "loss": 0.0031, + "num_tokens": 7672185.0, + "reward": 0.3671875, + "reward_std": 0.20357418060302734, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998801946640015, + "sampling/importance_sampling_ratio/min": 5.315856554943821e-08, + "sampling/sampling_logp_difference/max": 16.74998664855957, + "sampling/sampling_logp_difference/mean": 0.018429335206747055, + "step": 11 + }, + { + "clip_ratio/high_max": 1.0117325928149512e-05, + "clip_ratio/high_mean": 2.529331482037378e-06, + "clip_ratio/low_mean": 1.1982813475697185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.45121450714214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5282.6796875, + "completions/mean_terminated_length": 5106.46875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "entropy": 1.113751620054245, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013591813622042537, + "learning_rate": 1e-05, + "loss": 0.0971, + "num_tokens": 8369000.0, + "reward": 0.3984375, + "reward_std": 0.3029736578464508, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 3.970265970565379e-05, + "sampling/sampling_logp_difference/max": 10.134092330932617, + "sampling/sampling_logp_difference/mean": 0.020221836864948273, + "step": 12 + }, + { + "clip_ratio/high_max": 5.411958227341529e-06, + "clip_ratio/high_mean": 1.3529895568353822e-06, + "clip_ratio/low_mean": 2.5284593846208736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6637583516730956e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15925.0, + "completions/mean_length": 6970.421875, + "completions/mean_terminated_length": 6744.49609375, + "completions/min_length": 53.0, + 
"completions/min_terminated_length": 53.0, + "entropy": 1.1721933633089066, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024079051800072193, + "learning_rate": 1e-05, + "loss": 0.0713, + "num_tokens": 9283182.0, + "reward": 0.171875, + "reward_std": 0.17965975403785706, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999163746833801, + "sampling/importance_sampling_ratio/min": 0.0008915197686292231, + "sampling/sampling_logp_difference/max": 7.0225830078125, + "sampling/sampling_logp_difference/mean": 0.021462474018335342, + "step": 13 + }, + { + "clip_ratio/high_max": 2.0661535927501973e-05, + "clip_ratio/high_mean": 5.165383981875493e-06, + "clip_ratio/low_mean": 2.4304956298237812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.947033948430544e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14658.0, + "completions/max_terminated_length": 14658.0, + "completions/mean_length": 4886.875, + "completions/mean_terminated_length": 4886.875, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 1.0108910650014877, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002063734456896782, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 9928446.0, + "reward": 0.3515625, + "reward_std": 0.2409384697675705, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 0.0003672837920021266, + "sampling/sampling_logp_difference/max": 7.9093756675720215, + "sampling/sampling_logp_difference/mean": 0.01918785460293293, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.4761846993424115e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4761846993424115e-06, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12992.0, + "completions/max_terminated_length": 12992.0, + "completions/mean_length": 4824.0078125, + "completions/mean_terminated_length": 4824.0078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 1.1070282831788063, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002424790756776929, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 10566415.0, + "reward": 0.28125, + "reward_std": 0.23698672652244568, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0011708867968991399, + "sampling/sampling_logp_difference/max": 6.749993801116943, + "sampling/sampling_logp_difference/mean": 0.02069389820098877, + "step": 15 + }, + { + "clip_ratio/high_max": 3.5075904634140898e-06, + "clip_ratio/high_mean": 8.768976158535224e-07, + "clip_ratio/low_mean": 2.2676964135825983e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3553861751679506e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12685.0, + "completions/mean_length": 5449.4140625, + "completions/mean_terminated_length": 5363.31494140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9817888736724854, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021046048495918512, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 11281908.0, + "reward": 0.2265625, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.2265625, + 
"rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.013273254036903381, + "sampling/sampling_logp_difference/max": 4.322004318237305, + "sampling/sampling_logp_difference/mean": 0.019556276500225067, + "step": 16 + }, + { + "clip_ratio/high_max": 1.624216065465589e-05, + "clip_ratio/high_mean": 4.060540163663973e-06, + "clip_ratio/low_mean": 5.4349347919924185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.840988796990132e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14133.0, + "completions/max_terminated_length": 14133.0, + "completions/mean_length": 5343.25, + "completions/mean_terminated_length": 5343.25, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 1.04741720110178, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035894038155674934, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 11987692.0, + "reward": 0.3359375, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998996257781982, + "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05, + "sampling/sampling_logp_difference/max": 10.749964714050293, + "sampling/sampling_logp_difference/mean": 0.020530637353658676, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.272115029380075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.272115029380075e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15138.0, + "completions/mean_length": 6301.9375, + "completions/mean_terminated_length": 
5806.09814453125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.8892941772937775, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032246762420982122, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 12814244.0, + "reward": 0.3125, + "reward_std": 0.3606000542640686, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999184608459473, + "sampling/importance_sampling_ratio/min": 0.021351110190153122, + "sampling/sampling_logp_difference/max": 3.846651554107666, + "sampling/sampling_logp_difference/mean": 0.017541853711009026, + "step": 18 + }, + { + "clip_ratio/high_max": 9.956602298188955e-06, + "clip_ratio/high_mean": 2.4891505745472386e-06, + "clip_ratio/low_mean": 2.772165316855535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0210803743102588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16213.0, + "completions/max_terminated_length": 16213.0, + "completions/mean_length": 5297.46875, + "completions/mean_terminated_length": 5297.46875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8097029253840446, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023969109170138836, + "learning_rate": 1e-05, + "loss": -0.0153, + "num_tokens": 13512520.0, + "reward": 0.359375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999222159385681, + "sampling/importance_sampling_ratio/min": 0.005766105372458696, + "sampling/sampling_logp_difference/max": 5.155758380889893, + "sampling/sampling_logp_difference/mean": 0.017464376986026764, + "step": 19 + }, + { + 
"clip_ratio/high_max": 1.0098337497765897e-05, + "clip_ratio/high_mean": 2.524584374441474e-06, + "clip_ratio/low_mean": 3.173396362399217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.425854845318099e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14655.0, + "completions/mean_length": 4890.34375, + "completions/mean_terminated_length": 4799.84228515625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.9267145916819572, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002759338356554508, + "learning_rate": 1e-05, + "loss": -0.0014, + "num_tokens": 14155556.0, + "reward": 0.3515625, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.008491010405123234, + "sampling/sampling_logp_difference/max": 4.768747329711914, + "sampling/sampling_logp_difference/mean": 0.018839433789253235, + "step": 20 + }, + { + "clip_ratio/high_max": 7.532389190600952e-06, + "clip_ratio/high_mean": 1.883097297650238e-06, + "clip_ratio/low_mean": 1.9051809317716106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0934906729053182e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16296.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 4609.40625, + "completions/mean_terminated_length": 4609.40625, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 1.171089917421341, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021055075339972973, + "learning_rate": 1e-05, + "loss": -0.0051, + "num_tokens": 14765328.0, + "reward": 0.2421875, + "reward_std": 
0.2409384548664093, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741911888123, + "sampling/importance_sampling_ratio/min": 5.368983693188056e-07, + "sampling/sampling_logp_difference/max": 14.437457084655762, + "sampling/sampling_logp_difference/mean": 0.020226795226335526, + "step": 21 + }, + { + "clip_ratio/high_max": 1.7169573766295798e-05, + "clip_ratio/high_mean": 4.2923934415739495e-06, + "clip_ratio/low_mean": 5.869748633813288e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.0162142189074075e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14299.0, + "completions/mean_length": 5099.0390625, + "completions/mean_terminated_length": 5010.18115234375, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.005959376692772, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027595218271017075, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 15438549.0, + "reward": 0.296875, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887347221375, + "sampling/importance_sampling_ratio/min": 0.00013984869292471558, + "sampling/sampling_logp_difference/max": 8.87494945526123, + "sampling/sampling_logp_difference/mean": 0.01902824640274048, + "step": 22 + }, + { + "clip_ratio/high_max": 5.162942670722259e-06, + "clip_ratio/high_mean": 1.2907356676805648e-06, + "clip_ratio/low_mean": 3.6872071063953626e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.816280593582633e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16204.0, + "completions/mean_length": 7138.0390625, + "completions/mean_terminated_length": 6839.7822265625, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.0403362140059471, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002748022088780999, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 16373898.0, + "reward": 0.296875, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999048709869385, + "sampling/importance_sampling_ratio/min": 0.0003802926803473383, + "sampling/sampling_logp_difference/max": 7.874569416046143, + "sampling/sampling_logp_difference/mean": 0.020853528752923012, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.6506045439164154e-05, + "clip_ratio/low_min": 5.709326615033206e-06, + "clip_ratio/region_mean": 5.6506045439164154e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14543.0, + "completions/mean_length": 5420.515625, + "completions/mean_terminated_length": 5334.18896484375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 1.1339883506298065, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029502976685762405, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 17088156.0, + "reward": 0.1953125, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 9.70982582657598e-05, + 
"sampling/sampling_logp_difference/max": 9.239787101745605, + "sampling/sampling_logp_difference/mean": 0.0199423898011446, + "step": 24 + }, + { + "clip_ratio/high_max": 5.619998319161823e-06, + "clip_ratio/high_mean": 1.4049995797904558e-06, + "clip_ratio/low_mean": 6.439320418394345e-05, + "clip_ratio/low_min": 4.70632539872895e-06, + "clip_ratio/region_mean": 6.57982034226734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14636.0, + "completions/mean_length": 5116.3046875, + "completions/mean_terminated_length": 4845.88037109375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.9503882825374603, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004891107324510813, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 17766619.0, + "reward": 0.3203125, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0010618992382660508, + "sampling/sampling_logp_difference/max": 6.847696304321289, + "sampling/sampling_logp_difference/mean": 0.01914183795452118, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.839018643247982e-05, + "clip_ratio/low_min": 4.115091087442124e-06, + "clip_ratio/region_mean": 3.839018643247982e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14634.0, + "completions/mean_length": 5061.8671875, + "completions/mean_terminated_length": 4972.71630859375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.0540335327386856, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0030373274348676205, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 18432938.0, + "reward": 0.34375, + "reward_std": 0.28118088841438293, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999624490737915, + "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06, + "sampling/sampling_logp_difference/max": 13.272432327270508, + "sampling/sampling_logp_difference/mean": 0.019548218697309494, + "step": 26 + }, + { + "clip_ratio/high_max": 1.4656657867817557e-05, + "clip_ratio/high_mean": 4.665093399580655e-06, + "clip_ratio/low_mean": 3.751162262233265e-05, + "clip_ratio/low_min": 4.413062470121076e-06, + "clip_ratio/region_mean": 4.2176716192443564e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15782.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6349.9765625, + "completions/mean_terminated_length": 6349.9765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0268081277608871, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017623496241867542, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 19264743.0, + "reward": 0.2734375, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 6.870362267363816e-05, + "sampling/sampling_logp_difference/max": 9.585708618164062, + "sampling/sampling_logp_difference/mean": 0.019106190651655197, + "step": 27 + }, + { + "clip_ratio/high_max": 9.221375876222737e-06, + "clip_ratio/high_mean": 2.3053439690556843e-06, + "clip_ratio/low_mean": 3.09787185415189e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.328406273794826e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 5815.484375, + "completions/mean_terminated_length": 5561.84033203125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 1.0389493256807327, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003111837198957801, + "learning_rate": 1e-05, + "loss": -0.0162, + "num_tokens": 20030109.0, + "reward": 0.34375, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000298023223877, + "sampling/importance_sampling_ratio/min": 0.02987043187022209, + "sampling/sampling_logp_difference/max": 3.5108861923217773, + "sampling/sampling_logp_difference/mean": 0.020060991868376732, + "step": 28 + }, + { + "clip_ratio/high_max": 6.7810142354574054e-06, + "clip_ratio/high_mean": 1.6952535588643514e-06, + "clip_ratio/low_mean": 4.474762545214617e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644287901101052e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 5157.1484375, + "completions/mean_terminated_length": 5068.748046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.0510126948356628, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003041633637621999, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 20710904.0, + "reward": 0.3125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.04357198625802994, + "sampling/sampling_logp_difference/max": 3.133340835571289, + "sampling/sampling_logp_difference/mean": 0.019007597118616104, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0962848566341563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0962848566341563e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15333.0, + "completions/max_terminated_length": 15333.0, + "completions/mean_length": 4446.3828125, + "completions/mean_terminated_length": 4446.3828125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.053279548883438, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022369560319930315, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 21298497.0, + "reward": 0.390625, + "reward_std": 0.24169495701789856, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998750686645508, + "sampling/importance_sampling_ratio/min": 0.006704842206090689, + "sampling/sampling_logp_difference/max": 5.00492525100708, + "sampling/sampling_logp_difference/mean": 0.01947362720966339, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8460265411922592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8460265411922592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15386.0, + "completions/mean_length": 6294.1484375, + "completions/mean_terminated_length": 6133.9921875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, 
+ "entropy": 1.2036212533712387, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021383841522037983, + "learning_rate": 1e-05, + "loss": 0.033, + "num_tokens": 22124812.0, + "reward": 0.171875, + "reward_std": 0.20752590894699097, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858736991882, + "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07, + "sampling/sampling_logp_difference/max": 14.742476463317871, + "sampling/sampling_logp_difference/mean": 0.022367021068930626, + "step": 31 + }, + { + "clip_ratio/high_max": 1.73864664247958e-05, + "clip_ratio/high_mean": 4.34661660619895e-06, + "clip_ratio/low_mean": 3.19569651310303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630358173722925e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14893.0, + "completions/mean_length": 6011.4921875, + "completions/mean_terminated_length": 5929.81884765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.123318687081337, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00126531848218292, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 22915091.0, + "reward": 0.171875, + "reward_std": 0.2330477386713028, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05, + "sampling/sampling_logp_difference/max": 11.02016544342041, + "sampling/sampling_logp_difference/mean": 0.019905246794223785, + "step": 32 + }, + { + "clip_ratio/high_max": 2.8753217975463485e-05, + "clip_ratio/high_mean": 
7.188304493865871e-06, + "clip_ratio/low_mean": 3.818478444372886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.537308905128157e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5152.46875, + "completions/mean_terminated_length": 5064.03125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 1.0477670058608055, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030069497879594564, + "learning_rate": 1e-05, + "loss": 0.1026, + "num_tokens": 23596487.0, + "reward": 0.3359375, + "reward_std": 0.29142576456069946, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 9.009604013954231e-07, + "sampling/sampling_logp_difference/max": 13.919804573059082, + "sampling/sampling_logp_difference/mean": 0.019003981724381447, + "step": 33 + }, + { + "clip_ratio/high_max": 3.069575450354023e-05, + "clip_ratio/high_mean": 7.673938625885057e-06, + "clip_ratio/low_mean": 3.4847614415411954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.252155258654966e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12792.0, + "completions/max_terminated_length": 12792.0, + "completions/mean_length": 4672.5703125, + "completions/mean_terminated_length": 4672.5703125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9471446052193642, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002676331205293536, + "learning_rate": 1e-05, + "loss": 0.0724, + "num_tokens": 24213408.0, + "reward": 0.3203125, + "reward_std": 0.2988021969795227, + "rewards/accuracy_reward/mean": 0.3203125, + 
"rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000251531600952, + "sampling/importance_sampling_ratio/min": 0.0013351094676181674, + "sampling/sampling_logp_difference/max": 6.618741989135742, + "sampling/sampling_logp_difference/mean": 0.0179576613008976, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6127243245355203e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6127243245355203e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16108.0, + "completions/mean_length": 7013.734375, + "completions/mean_terminated_length": 6711.4677734375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 1.1254516392946243, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023615453392267227, + "learning_rate": 1e-05, + "loss": 0.0384, + "num_tokens": 25130262.0, + "reward": 0.1953125, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06, + "sampling/sampling_logp_difference/max": 11.925450325012207, + "sampling/sampling_logp_difference/mean": 0.0215257927775383, + "step": 35 + }, + { + "clip_ratio/high_max": 4.06954040954588e-06, + "clip_ratio/high_mean": 1.01738510238647e-06, + "clip_ratio/low_mean": 4.180071573500754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.281810015527299e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5858.59375, + 
"completions/mean_terminated_length": 5605.984375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 1.0713739022612572, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029018481727689505, + "learning_rate": 1e-05, + "loss": 0.1041, + "num_tokens": 25898194.0, + "reward": 0.3671875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05, + "sampling/sampling_logp_difference/max": 10.992064476013184, + "sampling/sampling_logp_difference/mean": 0.019959844648838043, + "step": 36 + }, + { + "clip_ratio/high_max": 1.2810827229259303e-05, + "clip_ratio/high_mean": 3.2027068073148257e-06, + "clip_ratio/low_mean": 3.29701083501277e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.617281504375569e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14004.0, + "completions/mean_length": 6952.6015625, + "completions/mean_terminated_length": 6726.24853515625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.028619796037674, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022342968732118607, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 26812791.0, + "reward": 0.234375, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 4.540153167909011e-05, + "sampling/sampling_logp_difference/max": 9.999964714050293, + 
"sampling/sampling_logp_difference/mean": 0.02002539485692978, + "step": 37 + }, + { + "clip_ratio/high_max": 1.5225089100567857e-05, + "clip_ratio/high_mean": 6.960676159906143e-06, + "clip_ratio/low_mean": 4.09088329433871e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7869508762232726e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16361.0, + "completions/mean_length": 6413.421875, + "completions/mean_terminated_length": 6174.12841796875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9452399462461472, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021800603717565536, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 27652757.0, + "reward": 0.296875, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439120292664, + "sampling/importance_sampling_ratio/min": 3.895394547726028e-05, + "sampling/sampling_logp_difference/max": 10.153130531311035, + "sampling/sampling_logp_difference/mean": 0.019722118973731995, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.9564903318023426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9564903318023426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15754.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 5176.3515625, + "completions/mean_terminated_length": 5176.3515625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 1.0444758981466293, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004153470974415541, + "learning_rate": 1e-05, + "loss": 0.0798, + 
"num_tokens": 28334386.0, + "reward": 0.2734375, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.007421077694743872, + "sampling/sampling_logp_difference/max": 4.903430938720703, + "sampling/sampling_logp_difference/mean": 0.020159056410193443, + "step": 39 + }, + { + "clip_ratio/high_max": 1.725743459246587e-05, + "clip_ratio/high_mean": 4.3143586481164675e-06, + "clip_ratio/low_mean": 2.0204584302518924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.451894306432223e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 5178.9921875, + "completions/mean_terminated_length": 5001.13525390625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0803537145256996, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002477057045325637, + "learning_rate": 1e-05, + "loss": 0.0067, + "num_tokens": 29017145.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000497102737427, + "sampling/importance_sampling_ratio/min": 0.004630985204130411, + "sampling/sampling_logp_difference/max": 5.374985694885254, + "sampling/sampling_logp_difference/mean": 0.019826076924800873, + "step": 40 + }, + { + "clip_ratio/high_max": 1.6637992303003557e-05, + "clip_ratio/high_mean": 4.159498075750889e-06, + "clip_ratio/low_mean": 2.1970684144889674e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6130182106953725e-05, + "completions/clipped_ratio": 
0.0, + "completions/max_length": 14131.0, + "completions/max_terminated_length": 14131.0, + "completions/mean_length": 4980.359375, + "completions/mean_terminated_length": 4980.359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.9510642662644386, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016275218222290277, + "learning_rate": 1e-05, + "loss": -0.0097, + "num_tokens": 29673535.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750852584839, + "sampling/importance_sampling_ratio/min": 0.000599516904912889, + "sampling/sampling_logp_difference/max": 7.419386386871338, + "sampling/sampling_logp_difference/mean": 0.01844976656138897, + "step": 41 + }, + { + "clip_ratio/high_max": 2.8087193186365766e-05, + "clip_ratio/high_mean": 7.021798296591442e-06, + "clip_ratio/low_mean": 3.9683913541921356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.670571286169434e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 5778.6953125, + "completions/mean_terminated_length": 5695.18896484375, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 1.0413239300251007, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001847646082751453, + "learning_rate": 1e-05, + "loss": -0.0045, + "num_tokens": 30436416.0, + "reward": 0.2578125, + "reward_std": 0.33903977274894714, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998501539230347, + 
"sampling/importance_sampling_ratio/min": 0.00020348970429040492, + "sampling/sampling_logp_difference/max": 8.499895095825195, + "sampling/sampling_logp_difference/mean": 0.021502099931240082, + "step": 42 + }, + { + "clip_ratio/high_max": 2.68402091023745e-05, + "clip_ratio/high_mean": 8.575278570788214e-06, + "clip_ratio/low_mean": 4.547183698377921e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.404711600931478e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14182.0, + "completions/max_terminated_length": 14182.0, + "completions/mean_length": 4875.125, + "completions/mean_terminated_length": 4875.125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 1.0464690178632736, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021134833805263042, + "learning_rate": 1e-05, + "loss": 0.0727, + "num_tokens": 31083672.0, + "reward": 0.40625, + "reward_std": 0.3584783971309662, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340176582336, + "sampling/importance_sampling_ratio/min": 0.012113225646317005, + "sampling/sampling_logp_difference/max": 4.41345739364624, + "sampling/sampling_logp_difference/mean": 0.019140049815177917, + "step": 43 + }, + { + "clip_ratio/high_max": 3.9877967992651975e-05, + "clip_ratio/high_mean": 9.969491998162994e-06, + "clip_ratio/low_mean": 3.981287841270387e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9782369273998484e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 4691.421875, + "completions/mean_terminated_length": 4505.82568359375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 1.0229775309562683, + "epoch": 
0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037735572550445795, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 31703654.0, + "reward": 0.4453125, + "reward_std": 0.2993389964103699, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492168426514, + "sampling/importance_sampling_ratio/min": 0.03150063753128052, + "sampling/sampling_logp_difference/max": 3.457747459411621, + "sampling/sampling_logp_difference/mean": 0.01912039890885353, + "step": 44 + }, + { + "clip_ratio/high_max": 3.5441889849607833e-06, + "clip_ratio/high_mean": 8.860472462401958e-07, + "clip_ratio/low_mean": 1.5137359810069029e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6023407056309225e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 6821.96875, + "completions/mean_terminated_length": 6592.48046875, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 1.1132484003901482, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0010448681423440576, + "learning_rate": 1e-05, + "loss": 0.022, + "num_tokens": 32599778.0, + "reward": 0.2265625, + "reward_std": 0.1814819872379303, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999915361404419, + "sampling/importance_sampling_ratio/min": 0.006500681862235069, + "sampling/sampling_logp_difference/max": 5.035848140716553, + "sampling/sampling_logp_difference/mean": 0.02125459350645542, + "step": 45 + }, + { + "clip_ratio/high_max": 4.652893949241843e-06, + "clip_ratio/high_mean": 1.1632234873104608e-06, + "clip_ratio/low_mean": 
5.731516603191267e-05, + "clip_ratio/low_min": 9.891066838463303e-06, + "clip_ratio/region_mean": 5.8478389746596804e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 6834.3671875, + "completions/mean_terminated_length": 6605.17626953125, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9827468693256378, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0017670176457613707, + "learning_rate": 1e-05, + "loss": 0.1105, + "num_tokens": 33492737.0, + "reward": 0.3046875, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.0021202093921601772, + "sampling/sampling_logp_difference/max": 6.156240463256836, + "sampling/sampling_logp_difference/mean": 0.019490526989102364, + "step": 46 + }, + { + "clip_ratio/high_max": 6.717360520269722e-06, + "clip_ratio/high_mean": 2.503530367903295e-06, + "clip_ratio/low_mean": 2.5672919832686603e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8176450200589898e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14098.0, + "completions/mean_length": 6175.296875, + "completions/mean_terminated_length": 5845.98388671875, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 1.1584237962961197, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0016891945851966739, + "learning_rate": 1e-05, + "loss": -0.0008, + "num_tokens": 34312455.0, + "reward": 0.1875, + "reward_std": 0.19673937559127808, + "rewards/accuracy_reward/mean": 0.1875, + 
"rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 8.086384332273155e-05, + "sampling/sampling_logp_difference/max": 9.422743797302246, + "sampling/sampling_logp_difference/mean": 0.021749887615442276, + "step": 47 + }, + { + "clip_ratio/high_max": 2.2362002255249536e-05, + "clip_ratio/high_mean": 8.189798336388776e-06, + "clip_ratio/low_mean": 2.1058204993096297e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9248002192616696e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16054.0, + "completions/mean_length": 6036.8359375, + "completions/mean_terminated_length": 5955.3623046875, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.9301538467407227, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003834392176941037, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 35102738.0, + "reward": 0.4375, + "reward_std": 0.36614155769348145, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998494386672974, + "sampling/importance_sampling_ratio/min": 0.00013992394087836146, + "sampling/sampling_logp_difference/max": 8.874411582946777, + "sampling/sampling_logp_difference/mean": 0.019147861748933792, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1501961580506759e-05, + "clip_ratio/high_mean": 2.8754903951266897e-06, + "clip_ratio/low_mean": 4.08189714562468e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.369446196506033e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 
6262.46875, + "completions/mean_terminated_length": 5764.68798828125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.8599015846848488, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029804729856550694, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 35924886.0, + "reward": 0.3984375, + "reward_std": 0.3911295533180237, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999922513961792, + "sampling/importance_sampling_ratio/min": 0.00021375219512265176, + "sampling/sampling_logp_difference/max": 9.904524803161621, + "sampling/sampling_logp_difference/mean": 0.01815103553235531, + "step": 49 + }, + { + "clip_ratio/high_max": 2.4107544049911667e-05, + "clip_ratio/high_mean": 6.026886012477917e-06, + "clip_ratio/low_mean": 3.6588148361715866e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.261503391944643e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14556.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 5926.8984375, + "completions/mean_terminated_length": 5926.8984375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 1.0042993426322937, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022071697749197483, + "learning_rate": 1e-05, + "loss": 0.0059, + "num_tokens": 36700913.0, + "reward": 0.3359375, + "reward_std": 0.3306073546409607, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000010371208191, + "sampling/importance_sampling_ratio/min": 0.0005220364546403289, + "sampling/sampling_logp_difference/max": 7.557773113250732, + 
"sampling/sampling_logp_difference/mean": 0.01954064890742302, + "step": 50 + }, + { + "clip_ratio/high_max": 4.9106265578302555e-06, + "clip_ratio/high_mean": 1.2276566394575639e-06, + "clip_ratio/low_mean": 2.634599570683349e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7573652346291055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15217.0, + "completions/mean_length": 6873.6875, + "completions/mean_terminated_length": 6645.4404296875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 1.0255412608385086, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002320924773812294, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 37604865.0, + "reward": 0.234375, + "reward_std": 0.3135228157043457, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999098777770996, + "sampling/importance_sampling_ratio/min": 0.026153141632676125, + "sampling/sampling_logp_difference/max": 3.6437859535217285, + "sampling/sampling_logp_difference/mean": 0.019532475620508194, + "step": 51 + }, + { + "clip_ratio/high_max": 1.6350510122720152e-05, + "clip_ratio/high_mean": 4.087627530680038e-06, + "clip_ratio/low_mean": 2.351988746340794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7607515221461654e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15668.0, + "completions/mean_length": 6073.8984375, + "completions/mean_terminated_length": 5992.71630859375, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 1.0713753998279572, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002212709980085492, + 
"learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 38405196.0, + "reward": 0.359375, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998978972434998, + "sampling/importance_sampling_ratio/min": 8.706459084351081e-06, + "sampling/sampling_logp_difference/max": 11.651445388793945, + "sampling/sampling_logp_difference/mean": 0.021252838894724846, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.729486718384578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.729486718384578e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15299.0, + "completions/mean_length": 5838.71875, + "completions/mean_terminated_length": 5671.33349609375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 1.021155133843422, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001135052996687591, + "learning_rate": 1e-05, + "loss": 0.0178, + "num_tokens": 39171704.0, + "reward": 0.28125, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.003084881929680705, + "sampling/sampling_logp_difference/max": 5.7812418937683105, + "sampling/sampling_logp_difference/mean": 0.020781882107257843, + "step": 53 + }, + { + "clip_ratio/high_max": 1.7124169744420215e-05, + "clip_ratio/high_mean": 4.281042436105054e-06, + "clip_ratio/low_mean": 3.706903294187214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.135007543482061e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 14617.0, + "completions/max_terminated_length": 14617.0, + "completions/mean_length": 6358.5859375, + "completions/mean_terminated_length": 6358.5859375, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9720487147569656, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002638082252815366, + "learning_rate": 1e-05, + "loss": 0.0145, + "num_tokens": 40003859.0, + "reward": 0.40625, + "reward_std": 0.3174618184566498, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000380277633667, + "sampling/importance_sampling_ratio/min": 0.01960253342986107, + "sampling/sampling_logp_difference/max": 3.932096481323242, + "sampling/sampling_logp_difference/mean": 0.01991666667163372, + "step": 54 + }, + { + "clip_ratio/high_max": 6.55582925901399e-06, + "clip_ratio/high_mean": 2.994117721755174e-06, + "clip_ratio/low_mean": 2.222621503733535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5220332759090525e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14753.0, + "completions/max_terminated_length": 14753.0, + "completions/mean_length": 4634.1875, + "completions/mean_terminated_length": 4634.1875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9715309366583824, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001994960242882371, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 40616483.0, + "reward": 0.4375, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000698566436768, + 
"sampling/importance_sampling_ratio/min": 1.0510009815334342e-05, + "sampling/sampling_logp_difference/max": 11.46318244934082, + "sampling/sampling_logp_difference/mean": 0.01902047172188759, + "step": 55 + }, + { + "clip_ratio/high_max": 2.2474248908110894e-05, + "clip_ratio/high_mean": 7.571314540655294e-06, + "clip_ratio/low_mean": 4.3583780325207044e-05, + "clip_ratio/low_min": 4.6013396968191955e-06, + "clip_ratio/region_mean": 5.1155094070054474e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 6596.25, + "completions/mean_terminated_length": 6361.34423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.8207943215966225, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019902780186384916, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 41484443.0, + "reward": 0.4453125, + "reward_std": 0.326668381690979, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016689300537, + "sampling/importance_sampling_ratio/min": 7.485233072657138e-05, + "sampling/sampling_logp_difference/max": 9.499993324279785, + "sampling/sampling_logp_difference/mean": 0.018301833420991898, + "step": 56 + }, + { + "clip_ratio/high_max": 3.0019932637515012e-06, + "clip_ratio/high_mean": 7.504983159378753e-07, + "clip_ratio/low_mean": 4.332785601945943e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.407835376696312e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6785.75, + "completions/mean_terminated_length": 6313.70458984375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + 
"entropy": 0.9876058474183083, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015235114842653275, + "learning_rate": 1e-05, + "loss": 0.0128, + "num_tokens": 42372235.0, + "reward": 0.2421875, + "reward_std": 0.325075626373291, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999551773071289, + "sampling/importance_sampling_ratio/min": 0.026679370552301407, + "sampling/sampling_logp_difference/max": 3.6238646507263184, + "sampling/sampling_logp_difference/mean": 0.019945615902543068, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1349006601667497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1349006601667497e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 4881.2109375, + "completions/mean_terminated_length": 4510.1533203125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.989942155778408, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002033712575212121, + "learning_rate": 1e-05, + "loss": 0.1088, + "num_tokens": 43015238.0, + "reward": 0.4375, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000300407409668, + "sampling/importance_sampling_ratio/min": 0.0001238943514181301, + "sampling/sampling_logp_difference/max": 8.996081352233887, + "sampling/sampling_logp_difference/mean": 0.01887543685734272, + "step": 58 + }, + { + "clip_ratio/high_max": 2.584004687378183e-05, + "clip_ratio/high_mean": 6.4600117184454575e-06, + 
"clip_ratio/low_mean": 2.1371045761497953e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7831058105221018e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15001.0, + "completions/max_terminated_length": 15001.0, + "completions/mean_length": 4725.3984375, + "completions/mean_terminated_length": 4725.3984375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 1.0350637435913086, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030296226032078266, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 43637737.0, + "reward": 0.4453125, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999939203262329, + "sampling/importance_sampling_ratio/min": 0.00022932067804504186, + "sampling/sampling_logp_difference/max": 8.380389213562012, + "sampling/sampling_logp_difference/mean": 0.01995944231748581, + "step": 59 + }, + { + "clip_ratio/high_max": 1.994733975152485e-05, + "clip_ratio/high_mean": 4.986834937881213e-06, + "clip_ratio/low_mean": 3.5168303838872816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.015513832200668e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 4918.171875, + "completions/mean_terminated_length": 4736.1748046875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.965274304151535, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002758471528068185, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 44285327.0, + "reward": 0.328125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 
0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999663233757019, + "sampling/importance_sampling_ratio/min": 0.010958661325275898, + "sampling/sampling_logp_difference/max": 4.513625144958496, + "sampling/sampling_logp_difference/mean": 0.019083233550190926, + "step": 60 + }, + { + "clip_ratio/high_max": 1.0621563887980301e-05, + "clip_ratio/high_mean": 2.6553909719950752e-06, + "clip_ratio/low_mean": 3.838553107016196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1040922042157035e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15031.0, + "completions/mean_length": 4998.2890625, + "completions/mean_terminated_length": 4908.6376953125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9200445115566254, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027611786499619484, + "learning_rate": 1e-05, + "loss": 0.0575, + "num_tokens": 44944356.0, + "reward": 0.3515625, + "reward_std": 0.3895368278026581, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884366989136, + "sampling/importance_sampling_ratio/min": 0.0018651526188477874, + "sampling/sampling_logp_difference/max": 6.284412384033203, + "sampling/sampling_logp_difference/mean": 0.017853498458862305, + "step": 61 + }, + { + "clip_ratio/high_max": 1.0136624496226432e-05, + "clip_ratio/high_mean": 2.534156124056608e-06, + "clip_ratio/low_mean": 2.0260404085092887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2794560095462657e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6290.1796875, + 
"completions/mean_terminated_length": 6129.96044921875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.9360214695334435, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015557854203507304, + "learning_rate": 1e-05, + "loss": 0.0111, + "num_tokens": 45767867.0, + "reward": 0.34375, + "reward_std": 0.30168038606643677, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999427795410156, + "sampling/importance_sampling_ratio/min": 0.0011004531988874078, + "sampling/sampling_logp_difference/max": 6.812033176422119, + "sampling/sampling_logp_difference/mean": 0.0200855303555727, + "step": 62 + }, + { + "clip_ratio/high_max": 2.2559511307918e-06, + "clip_ratio/high_mean": 5.6398778269795e-07, + "clip_ratio/low_mean": 4.51761221711422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.574010984015331e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16366.0, + "completions/mean_length": 6486.15625, + "completions/mean_terminated_length": 6248.6083984375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.863138921558857, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026953541673719883, + "learning_rate": 1e-05, + "loss": -0.0194, + "num_tokens": 46618575.0, + "reward": 0.2578125, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999406337738037, + "sampling/importance_sampling_ratio/min": 0.0011708897072821856, + "sampling/sampling_logp_difference/max": 6.749991416931152, + 
"sampling/sampling_logp_difference/mean": 0.01863238587975502, + "step": 63 + }, + { + "clip_ratio/high_max": 1.0073357771034352e-05, + "clip_ratio/high_mean": 2.518339442758588e-06, + "clip_ratio/low_mean": 2.787370635815023e-05, + "clip_ratio/low_min": 3.837534222839167e-06, + "clip_ratio/region_mean": 3.0392045573535142e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 6442.7734375, + "completions/mean_terminated_length": 6284.9765625, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0242054909467697, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024442619178444147, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 47462274.0, + "reward": 0.328125, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998892545700073, + "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09, + "sampling/sampling_logp_difference/max": 19.124980926513672, + "sampling/sampling_logp_difference/mean": 0.019810764119029045, + "step": 64 + }, + { + "clip_ratio/high_max": 1.220810372615233e-05, + "clip_ratio/high_mean": 3.0520259315380827e-06, + "clip_ratio/low_mean": 4.339240456374682e-05, + "clip_ratio/low_min": 4.491233084991109e-06, + "clip_ratio/region_mean": 4.644443038159807e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 4807.765625, + "completions/mean_terminated_length": 4716.6142578125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 1.045751042664051, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.002512057079002261, + "learning_rate": 1e-05, + "loss": 0.003, + "num_tokens": 48096692.0, + "reward": 0.3671875, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999058842658997, + "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05, + "sampling/sampling_logp_difference/max": 11.374892234802246, + "sampling/sampling_logp_difference/mean": 0.01960371434688568, + "step": 65 + }, + { + "clip_ratio/high_max": 5.37941218681226e-06, + "clip_ratio/high_mean": 1.344853046703065e-06, + "clip_ratio/low_mean": 3.0161771633174794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1506624850408116e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 6703.8359375, + "completions/mean_terminated_length": 6471.51220703125, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 1.0592866837978363, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016389708034694195, + "learning_rate": 1e-05, + "loss": -0.024, + "num_tokens": 48974399.0, + "reward": 0.2734375, + "reward_std": 0.2585548758506775, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999353885650635, + "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06, + "sampling/sampling_logp_difference/max": 11.8125, + "sampling/sampling_logp_difference/mean": 0.020880095660686493, + "step": 66 + }, + { + "clip_ratio/high_max": 7.093600515872822e-06, + "clip_ratio/high_mean": 1.7734001289682055e-06, + "clip_ratio/low_mean": 4.470584758564655e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.647924811251869e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16295.0, + "completions/mean_length": 6140.5078125, + "completions/mean_terminated_length": 5724.10546875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 1.0998501181602478, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003946912474930286, + "learning_rate": 1e-05, + "loss": 0.0448, + "num_tokens": 49779920.0, + "reward": 0.34375, + "reward_std": 0.36796674132347107, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 2.849436668839189e-07, + "sampling/sampling_logp_difference/max": 15.070974349975586, + "sampling/sampling_logp_difference/mean": 0.021355850622057915, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.313956779038563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.313956779038563e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 6689.8046875, + "completions/mean_terminated_length": 6213.04052734375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.8561654165387154, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021656695753335953, + "learning_rate": 1e-05, + "loss": 0.0283, + "num_tokens": 50655023.0, + "reward": 0.203125, + "reward_std": 0.21723884344100952, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999941885471344, + "sampling/importance_sampling_ratio/min": 2.836359499269747e-06, + "sampling/sampling_logp_difference/max": 12.772989273071289, + "sampling/sampling_logp_difference/mean": 0.01873670145869255, + "step": 68 + }, + { + "clip_ratio/high_max": 2.3421607693308033e-05, + "clip_ratio/high_mean": 7.242933975248889e-06, + "clip_ratio/low_mean": 3.896083626386826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.620377103492501e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14330.0, + "completions/max_terminated_length": 14330.0, + "completions/mean_length": 5707.0078125, + "completions/mean_terminated_length": 5707.0078125, + "completions/min_length": 625.0, + "completions/min_terminated_length": 625.0, + "entropy": 1.1396166533231735, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004121148493140936, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 51406536.0, + "reward": 0.3125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999328851699829, + "sampling/importance_sampling_ratio/min": 0.0005196487763896585, + "sampling/sampling_logp_difference/max": 7.562357425689697, + "sampling/sampling_logp_difference/mean": 0.020000409334897995, + "step": 69 + }, + { + "clip_ratio/high_max": 1.82290532393381e-05, + "clip_ratio/high_mean": 4.557263309834525e-06, + "clip_ratio/low_mean": 2.5275351731579576e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9832615496161452e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5655.6328125, + "completions/mean_terminated_length": 5571.1572265625, + "completions/min_length": 157.0, + 
"completions/min_terminated_length": 157.0, + "entropy": 0.8928132206201553, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032538517843931913, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 52148473.0, + "reward": 0.3984375, + "reward_std": 0.29432642459869385, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000033378601074, + "sampling/importance_sampling_ratio/min": 0.0017573959194123745, + "sampling/sampling_logp_difference/max": 6.343922138214111, + "sampling/sampling_logp_difference/mean": 0.018881790339946747, + "step": 70 + }, + { + "clip_ratio/high_max": 1.2836022506235167e-05, + "clip_ratio/high_mean": 3.209005626558792e-06, + "clip_ratio/low_mean": 3.8109637216621195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.131864307055366e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7399.7890625, + "completions/mean_terminated_length": 7034.5771484375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.8808257132768631, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002061733277514577, + "learning_rate": 1e-05, + "loss": 0.0191, + "num_tokens": 53113230.0, + "reward": 0.3046875, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673962593079, + "sampling/importance_sampling_ratio/min": 0.005283349193632603, + "sampling/sampling_logp_difference/max": 5.243195056915283, + "sampling/sampling_logp_difference/mean": 0.018456293269991875, + "step": 71 + }, + { + "clip_ratio/high_max": 
1.5806871488166507e-05, + "clip_ratio/high_mean": 4.739466817227367e-06, + "clip_ratio/low_mean": 3.610486896832299e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.084433521711617e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16208.0, + "completions/mean_length": 5730.9609375, + "completions/mean_terminated_length": 5475.2880859375, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9486126750707626, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012298432411625981, + "learning_rate": 1e-05, + "loss": 0.0208, + "num_tokens": 53864049.0, + "reward": 0.359375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999348521232605, + "sampling/importance_sampling_ratio/min": 4.832820559386164e-05, + "sampling/sampling_logp_difference/max": 9.937495231628418, + "sampling/sampling_logp_difference/mean": 0.01919996738433838, + "step": 72 + }, + { + "clip_ratio/high_max": 1.2390134997986024e-05, + "clip_ratio/high_mean": 3.097533749496506e-06, + "clip_ratio/low_mean": 3.8867822581778455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.19653564449618e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13500.0, + "completions/mean_length": 4620.5703125, + "completions/mean_terminated_length": 4527.94482421875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9557560831308365, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002882040338590741, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 54473498.0, + "reward": 0.3984375, + "reward_std": 
0.39294686913490295, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915195465088, + "sampling/importance_sampling_ratio/min": 1.577107298089686e-07, + "sampling/sampling_logp_difference/max": 15.662503242492676, + "sampling/sampling_logp_difference/mean": 0.018525000661611557, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.088819471486204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.088819471486204e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16314.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 5074.0703125, + "completions/mean_terminated_length": 5074.0703125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8830869868397713, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003324020653963089, + "learning_rate": 1e-05, + "loss": 0.0305, + "num_tokens": 55141787.0, + "reward": 0.4609375, + "reward_std": 0.30115634202957153, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999203681945801, + "sampling/importance_sampling_ratio/min": 0.0009876838885247707, + "sampling/sampling_logp_difference/max": 6.920147895812988, + "sampling/sampling_logp_difference/mean": 0.018072880804538727, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.526649884908693e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.526649884908693e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15251.0, + "completions/max_terminated_length": 15251.0, + "completions/mean_length": 6192.1015625, + 
"completions/mean_terminated_length": 6192.1015625, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "entropy": 1.0888547226786613, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017452294705435634, + "learning_rate": 1e-05, + "loss": 0.0216, + "num_tokens": 55954144.0, + "reward": 0.2890625, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473690986633, + "sampling/importance_sampling_ratio/min": 5.061922365712235e-07, + "sampling/sampling_logp_difference/max": 14.496349334716797, + "sampling/sampling_logp_difference/mean": 0.021221645176410675, + "step": 75 + }, + { + "clip_ratio/high_max": 1.6768677141953958e-05, + "clip_ratio/high_mean": 5.080836899651331e-06, + "clip_ratio/low_mean": 3.340929970363504e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.84901372854074e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15740.0, + "completions/mean_length": 6204.296875, + "completions/mean_terminated_length": 6124.1416015625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 1.0423575639724731, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0033357341308146715, + "learning_rate": 1e-05, + "loss": 0.1073, + "num_tokens": 56765470.0, + "reward": 0.3359375, + "reward_std": 0.37875816226005554, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99998539686203, + "sampling/importance_sampling_ratio/min": 4.564182381727733e-05, + "sampling/sampling_logp_difference/max": 9.994686126708984, + 
"sampling/sampling_logp_difference/mean": 0.01908688060939312, + "step": 76 + }, + { + "clip_ratio/high_max": 3.149884150843718e-06, + "clip_ratio/high_mean": 7.874710377109295e-07, + "clip_ratio/low_mean": 2.430614893000893e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.509361991087644e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14409.0, + "completions/max_terminated_length": 14409.0, + "completions/mean_length": 5070.3125, + "completions/mean_terminated_length": 5070.3125, + "completions/min_length": 629.0, + "completions/min_terminated_length": 629.0, + "entropy": 1.0737399458885193, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038695367984473705, + "learning_rate": 1e-05, + "loss": 0.0015, + "num_tokens": 57432958.0, + "reward": 0.390625, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999223947525024, + "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06, + "sampling/sampling_logp_difference/max": 13.376652717590332, + "sampling/sampling_logp_difference/mean": 0.01970684342086315, + "step": 77 + }, + { + "clip_ratio/high_max": 1.9821940441033803e-05, + "clip_ratio/high_mean": 4.955485110258451e-06, + "clip_ratio/low_mean": 2.9055729555693688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.401121466595214e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15799.0, + "completions/mean_length": 5750.21875, + "completions/mean_terminated_length": 5495.00830078125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.9708107560873032, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002927646040916443, + "learning_rate": 1e-05, + 
"loss": 0.0166, + "num_tokens": 58187426.0, + "reward": 0.296875, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999390840530396, + "sampling/importance_sampling_ratio/min": 0.015204614959657192, + "sampling/sampling_logp_difference/max": 4.186156272888184, + "sampling/sampling_logp_difference/mean": 0.019483914598822594, + "step": 78 + }, + { + "clip_ratio/high_max": 2.3815636723156786e-05, + "clip_ratio/high_mean": 5.953909180789196e-06, + "clip_ratio/low_mean": 4.989707144886779e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.585097960647545e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15938.0, + "completions/mean_length": 6067.484375, + "completions/mean_terminated_length": 5986.251953125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9576351121068001, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0026169484481215477, + "learning_rate": 1e-05, + "loss": -0.0055, + "num_tokens": 58983336.0, + "reward": 0.390625, + "reward_std": 0.3406373858451843, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620914459229, + "sampling/importance_sampling_ratio/min": 1.974713995878119e-06, + "sampling/sampling_logp_difference/max": 13.135087013244629, + "sampling/sampling_logp_difference/mean": 0.019007554277777672, + "step": 79 + }, + { + "clip_ratio/high_max": 2.4238934656750644e-05, + "clip_ratio/high_mean": 7.786730066072778e-06, + "clip_ratio/low_mean": 4.5700241571466904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3486972547034384e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 13640.0, + "completions/max_terminated_length": 13640.0, + "completions/mean_length": 4612.8984375, + "completions/mean_terminated_length": 4612.8984375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.9636320173740387, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015429699560627341, + "learning_rate": 1e-05, + "loss": -0.018, + "num_tokens": 59590763.0, + "reward": 0.421875, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08, + "sampling/sampling_logp_difference/max": 17.468652725219727, + "sampling/sampling_logp_difference/mean": 0.019313856959342957, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0911465842109465e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0911465842109465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 6101.3125, + "completions/mean_terminated_length": 5854.5283203125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.8831139355897903, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022505265660583973, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 60391283.0, + "reward": 0.3125, + "reward_std": 0.29302334785461426, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + 
"sampling/importance_sampling_ratio/min": 0.0003816343960352242, + "sampling/sampling_logp_difference/max": 7.871047496795654, + "sampling/sampling_logp_difference/mean": 0.018377842381596565, + "step": 81 + }, + { + "clip_ratio/high_max": 1.547606643725885e-05, + "clip_ratio/high_mean": 3.869016609314713e-06, + "clip_ratio/low_mean": 2.478705800967873e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8656074391619768e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14862.0, + "completions/mean_length": 4705.9921875, + "completions/mean_terminated_length": 4614.03955078125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.9557913094758987, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002069958718493581, + "learning_rate": 1e-05, + "loss": -0.0015, + "num_tokens": 61021490.0, + "reward": 0.4296875, + "reward_std": 0.2637920379638672, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030232429504, + "sampling/importance_sampling_ratio/min": 2.76673017651774e-05, + "sampling/sampling_logp_difference/max": 10.495259284973145, + "sampling/sampling_logp_difference/mean": 0.018629569560289383, + "step": 82 + }, + { + "clip_ratio/high_max": 2.0910484636260662e-05, + "clip_ratio/high_mean": 5.2276211590651656e-06, + "clip_ratio/low_mean": 1.952954164607945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4757162805144617e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13745.0, + "completions/max_terminated_length": 13745.0, + "completions/mean_length": 5116.78125, + "completions/mean_terminated_length": 5116.78125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 1.0198405236005783, 
+ "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034461067989468575, + "learning_rate": 1e-05, + "loss": -0.0073, + "num_tokens": 61695382.0, + "reward": 0.265625, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999936819076538, + "sampling/importance_sampling_ratio/min": 0.012227212078869343, + "sampling/sampling_logp_difference/max": 4.4040913581848145, + "sampling/sampling_logp_difference/mean": 0.019400250166654587, + "step": 83 + }, + { + "clip_ratio/high_max": 1.5340228401328204e-05, + "clip_ratio/high_mean": 3.835057100332051e-06, + "clip_ratio/low_mean": 3.150914017169271e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.534419727202476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 5891.9140625, + "completions/mean_terminated_length": 5553.45947265625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.9568078517913818, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025854657869786024, + "learning_rate": 1e-05, + "loss": 0.1013, + "num_tokens": 62474883.0, + "reward": 0.3203125, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001013278961182, + "sampling/importance_sampling_ratio/min": 0.0015072470996528864, + "sampling/sampling_logp_difference/max": 6.497470378875732, + "sampling/sampling_logp_difference/mean": 0.019574139267206192, + "step": 84 + }, + { + "clip_ratio/high_max": 1.108303422370227e-05, + "clip_ratio/high_mean": 2.7707585559255676e-06, + 
"clip_ratio/low_mean": 2.2325777763398946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5096536319324514e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13671.0, + "completions/mean_length": 5300.3359375, + "completions/mean_terminated_length": 5213.06298828125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.9722280204296112, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025075653102248907, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 63172454.0, + "reward": 0.203125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 0.00020346972451079637, + "sampling/sampling_logp_difference/max": 8.499993324279785, + "sampling/sampling_logp_difference/mean": 0.02002432942390442, + "step": 85 + }, + { + "clip_ratio/high_max": 1.3991947980684927e-05, + "clip_ratio/high_mean": 3.4979869951712317e-06, + "clip_ratio/low_mean": 4.893367201930232e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.243165958290774e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15617.0, + "completions/mean_length": 6364.21875, + "completions/mean_terminated_length": 6205.1748046875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 1.0607495978474617, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017982006538659334, + "learning_rate": 1e-05, + "loss": -0.0117, + "num_tokens": 64007602.0, + "reward": 0.2890625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.2890625, + 
"rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 3.823801307589747e-05, + "sampling/sampling_logp_difference/max": 10.171680450439453, + "sampling/sampling_logp_difference/mean": 0.020373597741127014, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6416430046083406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6416430046083406e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14709.0, + "completions/mean_length": 5746.3125, + "completions/mean_terminated_length": 5403.1611328125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "entropy": 0.9913106113672256, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002207317156717181, + "learning_rate": 1e-05, + "loss": 0.063, + "num_tokens": 64762058.0, + "reward": 0.34375, + "reward_std": 0.3264310359954834, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999239444732666, + "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08, + "sampling/sampling_logp_difference/max": 16.744617462158203, + "sampling/sampling_logp_difference/mean": 0.020608089864253998, + "step": 87 + }, + { + "clip_ratio/high_max": 1.2681661701208213e-05, + "clip_ratio/high_mean": 3.1704154253020533e-06, + "clip_ratio/low_mean": 3.541917828897567e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.85895939416514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6088.5625, + 
"completions/mean_terminated_length": 5841.47216796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.9040444120764732, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012974507408216596, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 65561002.0, + "reward": 0.3671875, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998487234115601, + "sampling/importance_sampling_ratio/min": 6.021501121722395e-06, + "sampling/sampling_logp_difference/max": 12.020174026489258, + "sampling/sampling_logp_difference/mean": 0.01939838007092476, + "step": 88 + }, + { + "clip_ratio/high_max": 7.807132533343975e-06, + "clip_ratio/high_mean": 1.9517831333359936e-06, + "clip_ratio/low_mean": 1.8564539345788944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.05163223654381e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15021.0, + "completions/mean_length": 5765.5, + "completions/mean_terminated_length": 5510.65625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.9966336265206337, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0013380619930103421, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 66318482.0, + "reward": 0.375, + "reward_std": 0.13994136452674866, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999471306800842, + "sampling/importance_sampling_ratio/min": 7.288413598871557e-06, + "sampling/sampling_logp_difference/max": 11.829224586486816, + 
"sampling/sampling_logp_difference/mean": 0.018109245225787163, + "step": 89 + }, + { + "clip_ratio/high_max": 1.7906912489706883e-05, + "clip_ratio/high_mean": 4.476728122426721e-06, + "clip_ratio/low_mean": 2.5812531305291486e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0289259655091882e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16120.0, + "completions/mean_length": 5462.78125, + "completions/mean_terminated_length": 5200.67236328125, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "entropy": 0.9345141425728798, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023930128663778305, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 67038582.0, + "reward": 0.46875, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513030052185, + "sampling/importance_sampling_ratio/min": 0.008508839644491673, + "sampling/sampling_logp_difference/max": 4.7666497230529785, + "sampling/sampling_logp_difference/mean": 0.019220296293497086, + "step": 90 + }, + { + "clip_ratio/high_max": 1.551389118503721e-05, + "clip_ratio/high_mean": 3.878472796259302e-06, + "clip_ratio/low_mean": 3.239646628117043e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6274939645863924e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15034.0, + "completions/max_terminated_length": 15034.0, + "completions/mean_length": 5547.5078125, + "completions/mean_terminated_length": 5547.5078125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0511749312281609, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0013633714988827705, + "learning_rate": 1e-05, 
+ "loss": 0.0462, + "num_tokens": 67774487.0, + "reward": 0.203125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05, + "sampling/sampling_logp_difference/max": 11.418023109436035, + "sampling/sampling_logp_difference/mean": 0.020328814163804054, + "step": 91 + }, + { + "clip_ratio/high_max": 1.5384989410449634e-05, + "clip_ratio/high_mean": 3.846247352612409e-06, + "clip_ratio/low_mean": 3.441604167164769e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.826228908110352e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5835.4140625, + "completions/mean_terminated_length": 5406.609375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 1.0024723336100578, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0036165034398436546, + "learning_rate": 1e-05, + "loss": 0.0373, + "num_tokens": 68541660.0, + "reward": 0.34375, + "reward_std": 0.3584783673286438, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 9.518130354990717e-06, + "sampling/sampling_logp_difference/max": 11.562312126159668, + "sampling/sampling_logp_difference/mean": 0.020469525828957558, + "step": 92 + }, + { + "clip_ratio/high_max": 6.105602551542688e-06, + "clip_ratio/high_mean": 1.526400637885672e-06, + "clip_ratio/low_mean": 5.3129634352444555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.46560352177039e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15695.0, + "completions/mean_length": 6252.609375, + "completions/mean_terminated_length": 6172.83447265625, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "entropy": 1.0325519517064095, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022011541295796633, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 69365418.0, + "reward": 0.3828125, + "reward_std": 0.32301604747772217, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998809099197388, + "sampling/importance_sampling_ratio/min": 0.0005531083443202078, + "sampling/sampling_logp_difference/max": 7.4999566078186035, + "sampling/sampling_logp_difference/mean": 0.02079072594642639, + "step": 93 + }, + { + "clip_ratio/high_max": 4.348128641140647e-06, + "clip_ratio/high_mean": 1.0870321602851618e-06, + "clip_ratio/low_mean": 3.0097819148977578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.118485085451539e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15316.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 5581.484375, + "completions/mean_terminated_length": 5581.484375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.9222500994801521, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002300912281498313, + "learning_rate": 1e-05, + "loss": -0.0007, + "num_tokens": 70099320.0, + "reward": 0.296875, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9998577833175659, + "sampling/importance_sampling_ratio/min": 8.140386853483506e-08, + "sampling/sampling_logp_difference/max": 16.323843002319336, + "sampling/sampling_logp_difference/mean": 0.01952272653579712, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5122252029395895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5122252029395895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15781.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5424.140625, + "completions/mean_terminated_length": 5424.140625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 1.0446564108133316, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016312639927491546, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 70811474.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000094175338745, + "sampling/importance_sampling_ratio/min": 0.0021919538266956806, + "sampling/sampling_logp_difference/max": 6.12296199798584, + "sampling/sampling_logp_difference/mean": 0.019741754978895187, + "step": 95 + }, + { + "clip_ratio/high_max": 1.0354576261306647e-05, + "clip_ratio/high_mean": 3.496124691082514e-06, + "clip_ratio/low_mean": 4.096481598026003e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.446094089871622e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 5884.9609375, + "completions/mean_terminated_length": 5884.9609375, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9605691060423851, + "epoch": 
0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032865386456251144, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 71582701.0, + "reward": 0.4140625, + "reward_std": 0.3514111638069153, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999833106994629, + "sampling/importance_sampling_ratio/min": 1.149311810877407e-05, + "sampling/sampling_logp_difference/max": 11.373762130737305, + "sampling/sampling_logp_difference/mean": 0.019438734278082848, + "step": 96 + }, + { + "clip_ratio/high_max": 1.026998006636859e-05, + "clip_ratio/high_mean": 2.5674950165921473e-06, + "clip_ratio/low_mean": 3.5440503552308655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8007998455213965e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15361.0, + "completions/max_terminated_length": 15361.0, + "completions/mean_length": 4835.09375, + "completions/mean_terminated_length": 4835.09375, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.9038172215223312, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004721678793430328, + "learning_rate": 1e-05, + "loss": 0.1143, + "num_tokens": 72220025.0, + "reward": 0.4765625, + "reward_std": 0.38481879234313965, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99994957447052, + "sampling/importance_sampling_ratio/min": 2.710051205667696e-07, + "sampling/sampling_logp_difference/max": 15.12112808227539, + "sampling/sampling_logp_difference/mean": 0.017888439819216728, + "step": 97 + }, + { + "clip_ratio/high_max": 2.93432283342554e-05, + "clip_ratio/high_mean": 9.56252398509605e-06, + "clip_ratio/low_mean": 
4.7865792453194445e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.742831808674964e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14431.0, + "completions/mean_length": 5979.078125, + "completions/mean_terminated_length": 5897.1494140625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 1.0227951630949974, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0010532280430197716, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 73005515.0, + "reward": 0.2890625, + "reward_std": 0.30115631222724915, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999090433120728, + "sampling/importance_sampling_ratio/min": 0.00030157779110595584, + "sampling/sampling_logp_difference/max": 8.10648250579834, + "sampling/sampling_logp_difference/mean": 0.019633149728178978, + "step": 98 + }, + { + "clip_ratio/high_max": 4.203234766464448e-06, + "clip_ratio/high_mean": 1.050808691616112e-06, + "clip_ratio/low_mean": 2.5574990331733716e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6625799137036665e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15886.0, + "completions/max_terminated_length": 15886.0, + "completions/mean_length": 4292.1796875, + "completions/mean_terminated_length": 4292.1796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.8719984591007233, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038324075285345316, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 73572794.0, + "reward": 0.4375, + "reward_std": 0.2972046136856079, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999188780784607, + "sampling/importance_sampling_ratio/min": 0.015675775706768036, + "sampling/sampling_logp_difference/max": 4.155638694763184, + "sampling/sampling_logp_difference/mean": 0.018074234947562218, + "step": 99 + }, + { + "clip_ratio/high_max": 4.431366960488958e-06, + "clip_ratio/high_mean": 1.1078417401222396e-06, + "clip_ratio/low_mean": 4.433405501913512e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.54418968729442e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14674.0, + "completions/max_terminated_length": 14674.0, + "completions/mean_length": 5449.2890625, + "completions/mean_terminated_length": 5449.2890625, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9137986451387405, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004843447357416153, + "learning_rate": 1e-05, + "loss": 0.0166, + "num_tokens": 74289607.0, + "reward": 0.5, + "reward_std": 0.40609243512153625, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 8.851584993863071e-07, + "sampling/sampling_logp_difference/max": 13.937499046325684, + "sampling/sampling_logp_difference/mean": 0.018183842301368713, + "step": 100 + }, + { + "clip_ratio/high_max": 8.212076863856055e-06, + "clip_ratio/high_mean": 2.0530192159640137e-06, + "clip_ratio/low_mean": 3.6279372466196946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.833239122741361e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16163.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 4983.3515625, + "completions/mean_terminated_length": 4983.3515625, + 
"completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "entropy": 0.9354705810546875, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037651765160262585, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 74946484.0, + "reward": 0.3671875, + "reward_std": 0.3090519309043884, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 0.00011593531962716952, + "sampling/sampling_logp_difference/max": 9.062478065490723, + "sampling/sampling_logp_difference/mean": 0.018207306042313576, + "step": 101 + }, + { + "clip_ratio/high_max": 1.3182888324081432e-05, + "clip_ratio/high_mean": 3.295722081020358e-06, + "clip_ratio/low_mean": 2.544108633628639e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8736808644680423e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16039.0, + "completions/mean_length": 6351.1015625, + "completions/mean_terminated_length": 6027.45947265625, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "entropy": 0.9310042560100555, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0009160125628113747, + "learning_rate": 1e-05, + "loss": -0.023, + "num_tokens": 75779145.0, + "reward": 0.3828125, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998877048492432, + "sampling/importance_sampling_ratio/min": 0.0002961359277833253, + "sampling/sampling_logp_difference/max": 8.1246919631958, + "sampling/sampling_logp_difference/mean": 0.018513178452849388, + "step": 102 
+ }, + { + "clip_ratio/high_max": 1.1402620202716207e-05, + "clip_ratio/high_mean": 3.935649147024378e-06, + "clip_ratio/low_mean": 3.059757568735222e-05, + "clip_ratio/low_min": 4.3258582991256844e-06, + "clip_ratio/region_mean": 3.45332257438713e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14471.0, + "completions/mean_length": 5293.40625, + "completions/mean_terminated_length": 4935.64501953125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 1.0732879787683487, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023993055801838636, + "learning_rate": 1e-05, + "loss": 0.1021, + "num_tokens": 76475557.0, + "reward": 0.34375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000077724456787, + "sampling/importance_sampling_ratio/min": 6.613240111619234e-05, + "sampling/sampling_logp_difference/max": 9.623851776123047, + "sampling/sampling_logp_difference/mean": 0.020792219787836075, + "step": 103 + }, + { + "clip_ratio/high_max": 2.130644793396641e-05, + "clip_ratio/high_mean": 8.929533635182452e-06, + "clip_ratio/low_mean": 2.663600798769039e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.556554071337814e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 7619.7578125, + "completions/mean_terminated_length": 7409.41650390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9646238535642624, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014872358879074454, + "learning_rate": 1e-05, + "loss": 0.0439, + "num_tokens": 77474310.0, + 
"reward": 0.34375, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999638795852661, + "sampling/importance_sampling_ratio/min": 0.0016686831368133426, + "sampling/sampling_logp_difference/max": 6.395720481872559, + "sampling/sampling_logp_difference/mean": 0.020074717700481415, + "step": 104 + }, + { + "clip_ratio/high_max": 1.7765815300663235e-05, + "clip_ratio/high_mean": 5.154013138053415e-06, + "clip_ratio/low_mean": 5.166909659237717e-05, + "clip_ratio/low_min": 8.365680514543783e-06, + "clip_ratio/region_mean": 5.68231100714911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15984.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 5959.921875, + "completions/mean_terminated_length": 5959.921875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.004471093416214, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00398358516395092, + "learning_rate": 1e-05, + "loss": 0.1016, + "num_tokens": 78257132.0, + "reward": 0.359375, + "reward_std": 0.3653082847595215, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000170469284058, + "sampling/importance_sampling_ratio/min": 0.0030075267422944307, + "sampling/sampling_logp_difference/max": 5.806637287139893, + "sampling/sampling_logp_difference/mean": 0.020755283534526825, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6946955838648137e-05, + "clip_ratio/high_mean": 4.236738959662034e-06, + "clip_ratio/low_mean": 4.510891039899434e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.934564867653535e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 13736.0, + "completions/mean_length": 5427.03125, + "completions/mean_terminated_length": 5340.755859375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9117375314235687, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019883522763848305, + "learning_rate": 1e-05, + "loss": 0.01, + "num_tokens": 78971072.0, + "reward": 0.375, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000550746917725, + "sampling/importance_sampling_ratio/min": 0.0008046010043472052, + "sampling/sampling_logp_difference/max": 7.125164031982422, + "sampling/sampling_logp_difference/mean": 0.018812140449881554, + "step": 106 + }, + { + "clip_ratio/high_max": 2.968176841022796e-05, + "clip_ratio/high_mean": 7.42044210255699e-06, + "clip_ratio/low_mean": 3.220799408154562e-05, + "clip_ratio/low_min": 5.315981979947537e-06, + "clip_ratio/region_mean": 3.962843629778945e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16293.0, + "completions/max_terminated_length": 16293.0, + "completions/mean_length": 6062.078125, + "completions/mean_terminated_length": 6062.078125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 1.0164100378751755, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00450351694598794, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 79764434.0, + "reward": 0.2578125, + "reward_std": 0.26355957984924316, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999713897705078, + 
"sampling/importance_sampling_ratio/min": 0.0007411236292682588, + "sampling/sampling_logp_difference/max": 7.207343101501465, + "sampling/sampling_logp_difference/mean": 0.020526543259620667, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.856050622947805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.856050622947805e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13689.0, + "completions/max_terminated_length": 13689.0, + "completions/mean_length": 4856.53125, + "completions/mean_terminated_length": 4856.53125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 1.0780886858701706, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0033157530706375837, + "learning_rate": 1e-05, + "loss": 0.046, + "num_tokens": 80405238.0, + "reward": 0.3359375, + "reward_std": 0.3487703502178192, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999889135360718, + "sampling/importance_sampling_ratio/min": 0.033773623406887054, + "sampling/sampling_logp_difference/max": 3.7256407737731934, + "sampling/sampling_logp_difference/mean": 0.019188418984413147, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.975351790406421e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.975351790406421e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16335.0, + "completions/max_terminated_length": 16335.0, + "completions/mean_length": 3930.5859375, + "completions/mean_terminated_length": 3930.5859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8666863515973091, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.005471619311720133, + "learning_rate": 1e-05, + "loss": -0.0779, + "num_tokens": 80926721.0, + "reward": 0.5859375, + "reward_std": 0.3164186179637909, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000040531158447, + "sampling/importance_sampling_ratio/min": 0.0002562212466727942, + "sampling/sampling_logp_difference/max": 8.269469261169434, + "sampling/sampling_logp_difference/mean": 0.017708823084831238, + "step": 109 + }, + { + "clip_ratio/high_max": 6.743997801095247e-06, + "clip_ratio/high_mean": 1.6859994502738118e-06, + "clip_ratio/low_mean": 3.61007656692891e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7786765119562915e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15546.0, + "completions/mean_length": 5934.9453125, + "completions/mean_terminated_length": 5684.16845703125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.9991667941212654, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002580739092081785, + "learning_rate": 1e-05, + "loss": -0.0065, + "num_tokens": 81707978.0, + "reward": 0.3046875, + "reward_std": 0.24671243131160736, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000852346420288, + "sampling/importance_sampling_ratio/min": 0.002478762762621045, + "sampling/sampling_logp_difference/max": 5.999995708465576, + "sampling/sampling_logp_difference/mean": 0.019801246002316475, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.43532002741631e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.43532002741631e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 5866.84375, + "completions/mean_terminated_length": 5699.9052734375, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.9848997294902802, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0010949905263260007, + "learning_rate": 1e-05, + "loss": 0.0266, + "num_tokens": 82477310.0, + "reward": 0.2734375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999667406082153, + "sampling/importance_sampling_ratio/min": 9.04304688447155e-05, + "sampling/sampling_logp_difference/max": 9.310929298400879, + "sampling/sampling_logp_difference/mean": 0.020769795402884483, + "step": 111 + }, + { + "clip_ratio/high_max": 1.9307613456476247e-05, + "clip_ratio/high_mean": 4.826903364119062e-06, + "clip_ratio/low_mean": 5.842190330440644e-05, + "clip_ratio/low_min": 1.2287753634154797e-05, + "clip_ratio/region_mean": 6.324880496322294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14501.0, + "completions/max_terminated_length": 14501.0, + "completions/mean_length": 6613.7578125, + "completions/mean_terminated_length": 6613.7578125, + "completions/min_length": 1033.0, + "completions/min_terminated_length": 1033.0, + "entropy": 0.9176012054085732, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020384234376251698, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 83345055.0, + "reward": 0.3671875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999457001686096, + "sampling/importance_sampling_ratio/min": 0.029541675001382828, + "sampling/sampling_logp_difference/max": 3.5219533443450928, + "sampling/sampling_logp_difference/mean": 0.018883168697357178, + "step": 112 + }, + { + "clip_ratio/high_max": 1.382043183184578e-05, + "clip_ratio/high_mean": 3.455107957961445e-06, + "clip_ratio/low_mean": 5.789885449303256e-05, + "clip_ratio/low_min": 1.017130716718384e-05, + "clip_ratio/region_mean": 6.135396188255982e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 6392.3125, + "completions/mean_terminated_length": 6070.0, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "entropy": 0.904954232275486, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0031166900880634785, + "learning_rate": 1e-05, + "loss": 0.0351, + "num_tokens": 84186343.0, + "reward": 0.390625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999208450317383, + "sampling/importance_sampling_ratio/min": 0.00022529886336997151, + "sampling/sampling_logp_difference/max": 8.398082733154297, + "sampling/sampling_logp_difference/mean": 0.01931958645582199, + "step": 113 + }, + { + "clip_ratio/high_max": 1.7221671441802755e-05, + "clip_ratio/high_mean": 6.549099907715572e-06, + "clip_ratio/low_mean": 3.147818074467068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802728065238625e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5982.703125, + "completions/mean_terminated_length": 5817.603515625, + "completions/min_length": 294.0, + 
"completions/min_terminated_length": 294.0, + "entropy": 0.8394555225968361, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022041688207536936, + "learning_rate": 1e-05, + "loss": 0.1043, + "num_tokens": 84971129.0, + "reward": 0.3125, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030828475952, + "sampling/importance_sampling_ratio/min": 1.553593506287143e-06, + "sampling/sampling_logp_difference/max": 13.374939918518066, + "sampling/sampling_logp_difference/mean": 0.01795877143740654, + "step": 114 + }, + { + "clip_ratio/high_max": 2.9651660042873118e-05, + "clip_ratio/high_mean": 9.398806923854863e-06, + "clip_ratio/low_mean": 4.788733849636628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.728614519284747e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14988.0, + "completions/mean_length": 4976.921875, + "completions/mean_terminated_length": 4608.95166015625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.8381234556436539, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0037972736172378063, + "learning_rate": 1e-05, + "loss": 0.1244, + "num_tokens": 85625559.0, + "reward": 0.4765625, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970555305481, + "sampling/importance_sampling_ratio/min": 0.002990707289427519, + "sampling/sampling_logp_difference/max": 5.8122453689575195, + "sampling/sampling_logp_difference/mean": 0.01815030723810196, + "step": 115 + }, + { + "clip_ratio/high_max": 
4.130592969886493e-06, + "clip_ratio/high_mean": 1.0326482424716232e-06, + "clip_ratio/low_mean": 1.6904315600640984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7936963843112608e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 6307.2421875, + "completions/mean_terminated_length": 6065.400390625, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 1.1176434755325317, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0012413962977007031, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 86453606.0, + "reward": 0.28125, + "reward_std": 0.2280253767967224, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 0.004730688873678446, + "sampling/sampling_logp_difference/max": 5.353684425354004, + "sampling/sampling_logp_difference/mean": 0.021790307015180588, + "step": 116 + }, + { + "clip_ratio/high_max": 1.3160772823539446e-05, + "clip_ratio/high_mean": 3.2901932058848615e-06, + "clip_ratio/low_mean": 3.582628983167524e-05, + "clip_ratio/low_min": 2.61966624748311e-06, + "clip_ratio/region_mean": 3.911648195753514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 7263.1640625, + "completions/mean_terminated_length": 7044.26416015625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.107876107096672, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017762042116373777, + "learning_rate": 1e-05, + "loss": 0.0349, + "num_tokens": 87402763.0, + "reward": 0.2578125, + "reward_std": 
0.27776598930358887, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741315841675, + "sampling/importance_sampling_ratio/min": 0.0009408573969267309, + "sampling/sampling_logp_difference/max": 6.968719005584717, + "sampling/sampling_logp_difference/mean": 0.02103034406900406, + "step": 117 + }, + { + "clip_ratio/high_max": 3.987745776612428e-05, + "clip_ratio/high_mean": 1.1877163728968299e-05, + "clip_ratio/low_mean": 4.26799579145154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.455712096136267e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15416.0, + "completions/mean_length": 5093.859375, + "completions/mean_terminated_length": 4914.65087890625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 1.1065888702869415, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032127038575708866, + "learning_rate": 1e-05, + "loss": 0.0194, + "num_tokens": 88077385.0, + "reward": 0.421875, + "reward_std": 0.345874547958374, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 7.033879228401929e-05, + "sampling/sampling_logp_difference/max": 9.562187194824219, + "sampling/sampling_logp_difference/mean": 0.020314980298280716, + "step": 118 + }, + { + "clip_ratio/high_max": 9.35208754526684e-06, + "clip_ratio/high_mean": 4.4788730519940145e-06, + "clip_ratio/low_mean": 3.470697703278347e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.918584917528278e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15740.0, + "completions/mean_length": 6943.53125, + "completions/mean_terminated_length": 6639.0, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.9009081721305847, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028925195802003145, + "learning_rate": 1e-05, + "loss": 0.0862, + "num_tokens": 88985269.0, + "reward": 0.3984375, + "reward_std": 0.3535328209400177, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 6.553035092338177e-08, + "sampling/sampling_logp_difference/max": 16.540752410888672, + "sampling/sampling_logp_difference/mean": 0.019378282129764557, + "step": 119 + }, + { + "clip_ratio/high_max": 1.0939961612166371e-05, + "clip_ratio/high_mean": 2.734990403041593e-06, + "clip_ratio/low_mean": 2.4615862798782473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7350853201824066e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15148.0, + "completions/max_terminated_length": 15148.0, + "completions/mean_length": 4976.25, + "completions/mean_terminated_length": 4976.25, + "completions/min_length": 702.0, + "completions/min_terminated_length": 702.0, + "entropy": 0.9463540017604828, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017386430408805609, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 89645205.0, + "reward": 0.359375, + "reward_std": 0.26462042331695557, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999554753303528, + "sampling/importance_sampling_ratio/min": 7.889595508459024e-06, + 
"sampling/sampling_logp_difference/max": 11.74996566772461, + "sampling/sampling_logp_difference/mean": 0.018035830929875374, + "step": 120 + }, + { + "clip_ratio/high_max": 5.941629297012696e-06, + "clip_ratio/high_mean": 1.485407324253174e-06, + "clip_ratio/low_mean": 2.6826061798601586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8311469009167922e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 6439.5390625, + "completions/mean_terminated_length": 6281.69091796875, + "completions/min_length": 959.0, + "completions/min_terminated_length": 959.0, + "entropy": 0.899876207113266, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037381781730800867, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 90489394.0, + "reward": 0.3203125, + "reward_std": 0.2624938488006592, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999206066131592, + "sampling/importance_sampling_ratio/min": 0.003606764366850257, + "sampling/sampling_logp_difference/max": 5.62494421005249, + "sampling/sampling_logp_difference/mean": 0.019368179142475128, + "step": 121 + }, + { + "clip_ratio/high_max": 5.189952389628161e-06, + "clip_ratio/high_mean": 1.2974880974070402e-06, + "clip_ratio/low_mean": 3.058137212974543e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.187886022715247e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15979.0, + "completions/mean_length": 6876.46875, + "completions/mean_terminated_length": 6408.884765625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.1018569767475128, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0018562980694696307, + "learning_rate": 1e-05, + "loss": 0.095, + "num_tokens": 91390054.0, + "reward": 0.21875, + "reward_std": 0.29955869913101196, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999849796295166, + "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05, + "sampling/sampling_logp_difference/max": 10.436432838439941, + "sampling/sampling_logp_difference/mean": 0.020825792104005814, + "step": 122 + }, + { + "clip_ratio/high_max": 2.022083435804234e-05, + "clip_ratio/high_mean": 5.055208589510585e-06, + "clip_ratio/low_mean": 3.029032552603894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.53455343429232e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14153.0, + "completions/mean_length": 6501.5078125, + "completions/mean_terminated_length": 6344.64306640625, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "entropy": 1.073579266667366, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016695430967956781, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 92241535.0, + "reward": 0.2734375, + "reward_std": 0.28641316294670105, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998984336853027, + "sampling/importance_sampling_ratio/min": 0.0002380236255703494, + "sampling/sampling_logp_difference/max": 8.343140602111816, + "sampling/sampling_logp_difference/mean": 0.020438479259610176, + "step": 123 + }, + { + "clip_ratio/high_max": 3.3911180707946187e-06, + "clip_ratio/high_mean": 8.477795176986547e-07, + "clip_ratio/low_mean": 2.2190370486896427e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.30381500614385e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14345.0, + "completions/max_terminated_length": 14345.0, + "completions/mean_length": 5474.1328125, + "completions/mean_terminated_length": 5474.1328125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0692576617002487, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034909825772047043, + "learning_rate": 1e-05, + "loss": 0.0, + "num_tokens": 92962472.0, + "reward": 0.3046875, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000006079673767, + "sampling/importance_sampling_ratio/min": 0.0017851731972768903, + "sampling/sampling_logp_difference/max": 6.328239917755127, + "sampling/sampling_logp_difference/mean": 0.019930578768253326, + "step": 124 + }, + { + "clip_ratio/high_max": 2.6292200345778838e-05, + "clip_ratio/high_mean": 7.620442374900449e-06, + "clip_ratio/low_mean": 4.615546390596137e-05, + "clip_ratio/low_min": 1.366510537081922e-05, + "clip_ratio/region_mean": 5.3775906508235494e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + "completions/mean_length": 7512.078125, + "completions/mean_terminated_length": 7225.88671875, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9676955863833427, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023449272848665714, + "learning_rate": 1e-05, + "loss": 0.0454, + "num_tokens": 93950506.0, + "reward": 0.3203125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999359250068665, + "sampling/importance_sampling_ratio/min": 0.0016406332142651081, + "sampling/sampling_logp_difference/max": 6.412672996520996, + "sampling/sampling_logp_difference/mean": 0.020141655579209328, + "step": 125 + }, + { + "clip_ratio/high_max": 5.097255780128762e-06, + "clip_ratio/high_mean": 1.2743139450321905e-06, + "clip_ratio/low_mean": 3.3802551342887455e-05, + "clip_ratio/low_min": 4.146762421441963e-06, + "clip_ratio/region_mean": 3.5076865287919645e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6920.484375, + "completions/mean_terminated_length": 6693.3603515625, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "entropy": 0.8662540689110756, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037103090435266495, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 94854016.0, + "reward": 0.4375, + "reward_std": 0.322716623544693, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00047686786274425685, + "sampling/sampling_logp_difference/max": 7.648271083831787, + "sampling/sampling_logp_difference/mean": 0.01915796287357807, + "step": 126 + }, + { + "clip_ratio/high_max": 8.4922439782531e-06, + "clip_ratio/high_mean": 2.123060994563275e-06, + "clip_ratio/low_mean": 5.024227584726759e-05, + "clip_ratio/low_min": 1.3627016414829995e-05, + "clip_ratio/region_mean": 5.236533706920454e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 7939.609375, + 
"completions/mean_terminated_length": 7805.57177734375, + "completions/min_length": 1260.0, + "completions/min_terminated_length": 1260.0, + "entropy": 0.9707008600234985, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024642283096909523, + "learning_rate": 1e-05, + "loss": 0.0788, + "num_tokens": 95889966.0, + "reward": 0.2265625, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998771548271179, + "sampling/importance_sampling_ratio/min": 4.540014560916461e-05, + "sampling/sampling_logp_difference/max": 9.999995231628418, + "sampling/sampling_logp_difference/mean": 0.020453302189707756, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.766829564710861e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.766829564710861e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14969.0, + "completions/mean_length": 5985.8203125, + "completions/mean_terminated_length": 5474.43408203125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.9083090648055077, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003317479742690921, + "learning_rate": 1e-05, + "loss": 0.0537, + "num_tokens": 96676847.0, + "reward": 0.3671875, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.000286750087980181, + "sampling/sampling_logp_difference/max": 8.156899452209473, + "sampling/sampling_logp_difference/mean": 
0.01996719278395176, + "step": 128 + }, + { + "clip_ratio/high_max": 1.8439853647578275e-05, + "clip_ratio/high_mean": 4.609963411894569e-06, + "clip_ratio/low_mean": 5.708034223061986e-05, + "clip_ratio/low_min": 2.75287948170444e-06, + "clip_ratio/region_mean": 6.169030598357494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15081.0, + "completions/mean_length": 6565.359375, + "completions/mean_terminated_length": 6488.04736328125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 1.1013468354940414, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019073591101914644, + "learning_rate": 1e-05, + "loss": 0.0622, + "num_tokens": 97539453.0, + "reward": 0.2734375, + "reward_std": 0.307217001914978, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999555945396423, + "sampling/importance_sampling_ratio/min": 0.0006022047018632293, + "sampling/sampling_logp_difference/max": 7.414913177490234, + "sampling/sampling_logp_difference/mean": 0.02150837704539299, + "step": 129 + }, + { + "clip_ratio/high_max": 9.068485269381199e-06, + "clip_ratio/high_mean": 2.2671213173452998e-06, + "clip_ratio/low_mean": 1.9822365402433206e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.208948649240483e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16099.0, + "completions/mean_length": 6779.6171875, + "completions/mean_terminated_length": 6703.9921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8940552547574043, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0010163087863475084, + "learning_rate": 1e-05, + "loss": 0.0249, + 
"num_tokens": 98429036.0, + "reward": 0.453125, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485015869141, + "sampling/importance_sampling_ratio/min": 3.464699460664633e-08, + "sampling/sampling_logp_difference/max": 17.178054809570312, + "sampling/sampling_logp_difference/mean": 0.018716152757406235, + "step": 130 + }, + { + "clip_ratio/high_max": 5.047242211730918e-06, + "clip_ratio/high_mean": 1.2618105529327295e-06, + "clip_ratio/low_mean": 2.9014110396019532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0275920835265424e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14549.0, + "completions/max_terminated_length": 14549.0, + "completions/mean_length": 5766.71875, + "completions/mean_terminated_length": 5766.71875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 1.0455922111868858, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002155766822397709, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 99184264.0, + "reward": 0.4140625, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999253749847412, + "sampling/importance_sampling_ratio/min": 0.00010798005678225309, + "sampling/sampling_logp_difference/max": 9.133563995361328, + "sampling/sampling_logp_difference/mean": 0.020948775112628937, + "step": 131 + }, + { + "clip_ratio/high_max": 2.0882574972347356e-05, + "clip_ratio/high_mean": 6.505383225885453e-06, + "clip_ratio/low_mean": 4.496008500609605e-05, + "clip_ratio/low_min": 7.757854064038838e-06, + "clip_ratio/region_mean": 5.1465468231981504e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14704.0, + "completions/mean_length": 6167.2421875, + "completions/mean_terminated_length": 6005.07177734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.9100174158811569, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0021464223973453045, + "learning_rate": 1e-05, + "loss": -0.0279, + "num_tokens": 99996831.0, + "reward": 0.421875, + "reward_std": 0.3916535973548889, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240040779114, + "sampling/importance_sampling_ratio/min": 0.02249590866267681, + "sampling/sampling_logp_difference/max": 3.794421911239624, + "sampling/sampling_logp_difference/mean": 0.01866895705461502, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0998018473837874e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0998018473837874e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15738.0, + "completions/mean_length": 6242.9453125, + "completions/mean_terminated_length": 6163.09423828125, + "completions/min_length": 1187.0, + "completions/min_terminated_length": 1187.0, + "entropy": 0.8624134212732315, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023277695290744305, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 100814112.0, + "reward": 0.3984375, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999959409236908, + 
"sampling/importance_sampling_ratio/min": 0.0002393616596236825, + "sampling/sampling_logp_difference/max": 8.33753490447998, + "sampling/sampling_logp_difference/mean": 0.0191188994795084, + "step": 133 + }, + { + "clip_ratio/high_max": 6.589872555196052e-06, + "clip_ratio/high_mean": 1.647468138799013e-06, + "clip_ratio/low_mean": 4.329304238126497e-05, + "clip_ratio/low_min": 3.5120251595799346e-06, + "clip_ratio/region_mean": 4.494051017900347e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14866.0, + "completions/mean_length": 5733.6875, + "completions/mean_terminated_length": 5478.080078125, + "completions/min_length": 789.0, + "completions/min_terminated_length": 789.0, + "entropy": 0.9628067463636398, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003547821193933487, + "learning_rate": 1e-05, + "loss": 0.0321, + "num_tokens": 101566264.0, + "reward": 0.3984375, + "reward_std": 0.36584997177124023, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0001282035664189607, + "sampling/sampling_logp_difference/max": 8.961891174316406, + "sampling/sampling_logp_difference/mean": 0.019646761938929558, + "step": 134 + }, + { + "clip_ratio/high_max": 1.7107527582993498e-05, + "clip_ratio/high_mean": 4.2768818957483745e-06, + "clip_ratio/low_mean": 3.014796902789385e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.442485103732906e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15848.0, + "completions/max_terminated_length": 15848.0, + "completions/mean_length": 5505.9375, + "completions/mean_terminated_length": 5505.9375, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "entropy": 
0.8041045889258385, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024891747161746025, + "learning_rate": 1e-05, + "loss": 0.1406, + "num_tokens": 102291456.0, + "reward": 0.5, + "reward_std": 0.35482609272003174, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999248385429382, + "sampling/importance_sampling_ratio/min": 0.0014627616619691253, + "sampling/sampling_logp_difference/max": 6.527429103851318, + "sampling/sampling_logp_difference/mean": 0.01716250739991665, + "step": 135 + }, + { + "clip_ratio/high_max": 1.548903105685895e-05, + "clip_ratio/high_mean": 3.872257764214737e-06, + "clip_ratio/low_mean": 5.380711581892683e-05, + "clip_ratio/low_min": 4.5777483137499075e-06, + "clip_ratio/region_mean": 5.767937363998499e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16005.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 5003.0625, + "completions/mean_terminated_length": 5003.0625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 0.9115714654326439, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00220683915540576, + "learning_rate": 1e-05, + "loss": 0.1361, + "num_tokens": 102949824.0, + "reward": 0.4140625, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 8.323705696966499e-05, + "sampling/sampling_logp_difference/max": 9.393817901611328, + "sampling/sampling_logp_difference/mean": 0.018076512962579727, + "step": 136 + }, + { + "clip_ratio/high_max": 2.181136096623959e-05, + "clip_ratio/high_mean": 
5.4528402415598975e-06, + "clip_ratio/low_mean": 3.4416837252138066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986967681157694e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15658.0, + "completions/max_terminated_length": 15658.0, + "completions/mean_length": 4742.1328125, + "completions/mean_terminated_length": 4742.1328125, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.9430246204137802, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003964806906878948, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 103580913.0, + "reward": 0.4609375, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 7.031940185697749e-05, + "sampling/sampling_logp_difference/max": 9.56246280670166, + "sampling/sampling_logp_difference/mean": 0.019651200622320175, + "step": 137 + }, + { + "clip_ratio/high_max": 4.07684046876966e-06, + "clip_ratio/high_mean": 1.019210117192415e-06, + "clip_ratio/low_mean": 3.8682398553646635e-05, + "clip_ratio/low_min": 8.189203072106466e-06, + "clip_ratio/region_mean": 3.970160832977854e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 6574.171875, + "completions/mean_terminated_length": 6091.72119140625, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.8429529070854187, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002067410387098789, + "learning_rate": 1e-05, + "loss": 0.0377, + "num_tokens": 104447463.0, + "reward": 0.3125, + "reward_std": 0.24511480331420898, + "rewards/accuracy_reward/mean": 0.3125, 
+ "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997583627700806, + "sampling/importance_sampling_ratio/min": 0.00021258489869069308, + "sampling/sampling_logp_difference/max": 8.456169128417969, + "sampling/sampling_logp_difference/mean": 0.018853647634387016, + "step": 138 + }, + { + "clip_ratio/high_max": 1.9725823221961036e-05, + "clip_ratio/high_mean": 4.931455805490259e-06, + "clip_ratio/low_mean": 5.9263072444082354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.419452870431996e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15518.0, + "completions/max_terminated_length": 15518.0, + "completions/mean_length": 4581.5625, + "completions/mean_terminated_length": 4581.5625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.7094272822141647, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004292502999305725, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 105052287.0, + "reward": 0.625, + "reward_std": 0.3908300995826721, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.0019342642044648528, + "sampling/sampling_logp_difference/max": 6.24802827835083, + "sampling/sampling_logp_difference/mean": 0.016310662031173706, + "step": 139 + }, + { + "clip_ratio/high_max": 1.0132298029930098e-05, + "clip_ratio/high_mean": 2.5330745074825245e-06, + "clip_ratio/low_mean": 4.6397121650443296e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.893019581686531e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16097.0, + "completions/mean_length": 7066.4453125, + 
"completions/mean_terminated_length": 6918.5478515625, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8481669947504997, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015785128343850374, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 105977048.0, + "reward": 0.3515625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.00104097044095397, + "sampling/sampling_logp_difference/max": 6.8676018714904785, + "sampling/sampling_logp_difference/mean": 0.018304405733942986, + "step": 140 + }, + { + "clip_ratio/high_max": 1.6989023606583942e-05, + "clip_ratio/high_mean": 4.2472559016459854e-06, + "clip_ratio/low_mean": 2.3075059743860038e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7322315418132348e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16104.0, + "completions/max_terminated_length": 16104.0, + "completions/mean_length": 6230.5234375, + "completions/mean_terminated_length": 6230.5234375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.9658062160015106, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002542720176279545, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 106793187.0, + "reward": 0.3203125, + "reward_std": 0.3050953149795532, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000169277191162, + "sampling/importance_sampling_ratio/min": 0.0002781494113150984, + "sampling/sampling_logp_difference/max": 8.187352180480957, + 
"sampling/sampling_logp_difference/mean": 0.019391046836972237, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7597974508353218e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7597974508353218e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14216.0, + "completions/mean_length": 5690.5546875, + "completions/mean_terminated_length": 5606.3544921875, + "completions/min_length": 1124.0, + "completions/min_terminated_length": 1124.0, + "entropy": 1.0098655670881271, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001451602904126048, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 107539874.0, + "reward": 0.4296875, + "reward_std": 0.23304283618927002, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999307990074158, + "sampling/importance_sampling_ratio/min": 5.640022671116185e-09, + "sampling/sampling_logp_difference/max": 18.993377685546875, + "sampling/sampling_logp_difference/mean": 0.018607191741466522, + "step": 142 + }, + { + "clip_ratio/high_max": 1.2800467629858758e-05, + "clip_ratio/high_mean": 4.19954119479371e-06, + "clip_ratio/low_mean": 2.350350996493944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.770305115973315e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15791.0, + "completions/max_terminated_length": 15791.0, + "completions/mean_length": 5471.1328125, + "completions/mean_terminated_length": 5471.1328125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0413162112236023, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023549250327050686, + "learning_rate": 1e-05, + "loss": 0.0411, + 
"num_tokens": 108260091.0, + "reward": 0.3203125, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999832510948181, + "sampling/importance_sampling_ratio/min": 0.0011709182290360332, + "sampling/sampling_logp_difference/max": 6.749967098236084, + "sampling/sampling_logp_difference/mean": 0.020427243784070015, + "step": 143 + }, + { + "clip_ratio/high_max": 2.1983064925734652e-05, + "clip_ratio/high_mean": 5.495766231433663e-06, + "clip_ratio/low_mean": 4.361141452591255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9107180757346214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 6211.7421875, + "completions/mean_terminated_length": 6050.2783203125, + "completions/min_length": 622.0, + "completions/min_terminated_length": 622.0, + "entropy": 0.9706784337759018, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017527056625112891, + "learning_rate": 1e-05, + "loss": 0.0686, + "num_tokens": 109073890.0, + "reward": 0.421875, + "reward_std": 0.29826050996780396, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999092221260071, + "sampling/importance_sampling_ratio/min": 0.002898645820096135, + "sampling/sampling_logp_difference/max": 5.843511581420898, + "sampling/sampling_logp_difference/mean": 0.018898162990808487, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.208964992358233e-05, + "clip_ratio/low_min": 3.9168990042526275e-06, + "clip_ratio/region_mean": 4.208964992358233e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 14880.0, + "completions/mean_length": 6007.8984375, + "completions/mean_terminated_length": 5926.19677734375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.1967609524726868, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0007858420140109956, + "learning_rate": 1e-05, + "loss": 0.011, + "num_tokens": 109861813.0, + "reward": 0.296875, + "reward_std": 0.23486506938934326, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 3.294382011631569e-08, + "sampling/sampling_logp_difference/max": 17.22846221923828, + "sampling/sampling_logp_difference/mean": 0.021845955401659012, + "step": 145 + }, + { + "clip_ratio/high_max": 4.5118208618077915e-06, + "clip_ratio/high_mean": 1.1279552154519479e-06, + "clip_ratio/low_mean": 3.749712686840212e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8625082197540905e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15838.0, + "completions/mean_length": 6800.9921875, + "completions/mean_terminated_length": 6725.53564453125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.0437887012958527, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029428249690681696, + "learning_rate": 1e-05, + "loss": 0.0405, + "num_tokens": 110756572.0, + "reward": 0.265625, + "reward_std": 0.3248382806777954, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999890327453613, + 
"sampling/importance_sampling_ratio/min": 0.0006329434108920395, + "sampling/sampling_logp_difference/max": 7.365129470825195, + "sampling/sampling_logp_difference/mean": 0.02010120078921318, + "step": 146 + }, + { + "clip_ratio/high_max": 1.427700522071973e-05, + "clip_ratio/high_mean": 3.5692513051799324e-06, + "clip_ratio/low_mean": 4.964020990883e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.320946092979284e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6309.4453125, + "completions/mean_terminated_length": 6230.1181640625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.9768906533718109, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002088683657348156, + "learning_rate": 1e-05, + "loss": 0.0316, + "num_tokens": 111585493.0, + "reward": 0.375, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000007152557373, + "sampling/importance_sampling_ratio/min": 0.009723234921693802, + "sampling/sampling_logp_difference/max": 4.633236885070801, + "sampling/sampling_logp_difference/mean": 0.020927833393216133, + "step": 147 + }, + { + "clip_ratio/high_max": 5.4841398196003865e-06, + "clip_ratio/high_mean": 1.3710349549000966e-06, + "clip_ratio/low_mean": 5.122006064084417e-05, + "clip_ratio/low_min": 3.785125954891555e-06, + "clip_ratio/region_mean": 5.25910957094311e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15209.0, + "completions/mean_length": 6221.859375, + "completions/mean_terminated_length": 6060.5556640625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 
0.9212924689054489, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002406956860795617, + "learning_rate": 1e-05, + "loss": 0.1051, + "num_tokens": 112400363.0, + "reward": 0.40625, + "reward_std": 0.31929677724838257, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 5.8308287407271564e-05, + "sampling/sampling_logp_difference/max": 9.74976634979248, + "sampling/sampling_logp_difference/mean": 0.018652018159627914, + "step": 148 + }, + { + "clip_ratio/high_max": 1.4568151755156578e-05, + "clip_ratio/high_mean": 3.6420379387891444e-06, + "clip_ratio/low_mean": 3.999794398623635e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3639981413434725e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14997.0, + "completions/mean_length": 6942.8203125, + "completions/mean_terminated_length": 6716.232421875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.949538916349411, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022962254006415606, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 113308748.0, + "reward": 0.375, + "reward_std": 0.3329663872718811, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999334812164307, + "sampling/importance_sampling_ratio/min": 0.00048810525913722813, + "sampling/sampling_logp_difference/max": 7.624979496002197, + "sampling/sampling_logp_difference/mean": 0.01939917355775833, + "step": 149 + }, + { + "clip_ratio/high_max": 8.786732450971613e-06, + "clip_ratio/high_mean": 
2.196683112742903e-06, + "clip_ratio/low_mean": 5.562954720517155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7826231113722315e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15182.0, + "completions/mean_length": 6783.1796875, + "completions/mean_terminated_length": 6552.76025390625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.9774708449840546, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020560629200190306, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 114196235.0, + "reward": 0.34375, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998990297317505, + "sampling/importance_sampling_ratio/min": 2.4757892447269114e-07, + "sampling/sampling_logp_difference/max": 15.211536407470703, + "sampling/sampling_logp_difference/mean": 0.019691556692123413, + "step": 150 + }, + { + "clip_ratio/high_max": 1.799483243303257e-05, + "clip_ratio/high_mean": 4.498708108258143e-06, + "clip_ratio/low_mean": 2.6389980291696702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0888688343111426e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15549.0, + "completions/mean_length": 5568.15625, + "completions/mean_terminated_length": 5396.4765625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "entropy": 0.9303529411554337, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022214846685528755, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 114928047.0, + "reward": 0.234375, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.234375, + 
"rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999408721923828, + "sampling/importance_sampling_ratio/min": 2.1446083337650634e-05, + "sampling/sampling_logp_difference/max": 10.749968528747559, + "sampling/sampling_logp_difference/mean": 0.01938418298959732, + "step": 151 + }, + { + "clip_ratio/high_max": 1.1957493370573502e-05, + "clip_ratio/high_mean": 2.9893733426433755e-06, + "clip_ratio/low_mean": 5.885063319510664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.184000585562899e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15340.0, + "completions/max_terminated_length": 15340.0, + "completions/mean_length": 6086.578125, + "completions/mean_terminated_length": 6086.578125, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 0.9131873697042465, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002448044717311859, + "learning_rate": 1e-05, + "loss": 0.0599, + "num_tokens": 115725657.0, + "reward": 0.40625, + "reward_std": 0.35878273844718933, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999779462814331, + "sampling/importance_sampling_ratio/min": 0.02929726243019104, + "sampling/sampling_logp_difference/max": 3.530261278152466, + "sampling/sampling_logp_difference/mean": 0.019298439845442772, + "step": 152 + }, + { + "clip_ratio/high_max": 1.3385357760853367e-05, + "clip_ratio/high_mean": 3.3463394402133417e-06, + "clip_ratio/low_mean": 5.717015119444113e-05, + "clip_ratio/low_min": 3.4328400033700746e-06, + "clip_ratio/region_mean": 6.0516490520967636e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + 
"completions/mean_length": 6442.5390625, + "completions/mean_terminated_length": 6203.9443359375, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.8959419652819633, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002013204852119088, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 116571478.0, + "reward": 0.2734375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000044584274292, + "sampling/importance_sampling_ratio/min": 1.0374163821325055e-06, + "sampling/sampling_logp_difference/max": 13.778777122497559, + "sampling/sampling_logp_difference/mean": 0.01925014518201351, + "step": 153 + }, + { + "clip_ratio/high_max": 9.34224021875707e-06, + "clip_ratio/high_mean": 3.136903728773177e-06, + "clip_ratio/low_mean": 2.9738095065567904e-05, + "clip_ratio/low_min": 3.7240065466903616e-06, + "clip_ratio/region_mean": 3.2874999135401595e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15946.0, + "completions/mean_length": 6633.5703125, + "completions/mean_terminated_length": 6319.0400390625, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.0223619118332863, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024523327592760324, + "learning_rate": 1e-05, + "loss": 0.056, + "num_tokens": 117440743.0, + "reward": 0.3203125, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 3.0026931199245155e-05, + 
"sampling/sampling_logp_difference/max": 10.413415908813477, + "sampling/sampling_logp_difference/mean": 0.02061290666460991, + "step": 154 + }, + { + "clip_ratio/high_max": 1.4537483366439119e-05, + "clip_ratio/high_mean": 3.6343708416097797e-06, + "clip_ratio/low_mean": 3.954866042477079e-05, + "clip_ratio/low_min": 9.874949228105834e-06, + "clip_ratio/region_mean": 4.318303126638057e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15919.0, + "completions/mean_length": 7183.0, + "completions/mean_terminated_length": 6886.193359375, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.9815369099378586, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018688985146582127, + "learning_rate": 1e-05, + "loss": 0.0395, + "num_tokens": 118380687.0, + "reward": 0.2890625, + "reward_std": 0.2498900145292282, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999039173126221, + "sampling/importance_sampling_ratio/min": 1.3847662557964213e-05, + "sampling/sampling_logp_difference/max": 11.187394142150879, + "sampling/sampling_logp_difference/mean": 0.019792160019278526, + "step": 155 + }, + { + "clip_ratio/high_max": 7.165636361605721e-06, + "clip_ratio/high_mean": 1.7914090904014301e-06, + "clip_ratio/low_mean": 4.9011068711024564e-05, + "clip_ratio/low_min": 1.0991705721608014e-05, + "clip_ratio/region_mean": 5.0802477687739156e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 6324.640625, + "completions/mean_terminated_length": 5829.91748046875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.852975606918335, + "epoch": 
0.14351425942962281, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002005894435569644, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 119207089.0, + "reward": 0.3984375, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000035762786865, + "sampling/importance_sampling_ratio/min": 5.788659223071591e-07, + "sampling/sampling_logp_difference/max": 14.362195014953613, + "sampling/sampling_logp_difference/mean": 0.01853565312922001, + "step": 156 + }, + { + "clip_ratio/high_max": 7.795394822096569e-06, + "clip_ratio/high_mean": 1.948848705524142e-06, + "clip_ratio/low_mean": 3.834237736555224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0291225786859286e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 5723.421875, + "completions/mean_terminated_length": 5290.06494140625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.8744911625981331, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002577397273853421, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 119961895.0, + "reward": 0.390625, + "reward_std": 0.34321609139442444, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999703764915466, + "sampling/importance_sampling_ratio/min": 0.07882421463727951, + "sampling/sampling_logp_difference/max": 2.5405349731445312, + "sampling/sampling_logp_difference/mean": 0.018341556191444397, + "step": 157 + }, + { + "clip_ratio/high_max": 9.214097190124448e-06, + "clip_ratio/high_mean": 2.303524297531112e-06, + 
"clip_ratio/low_mean": 2.636873176697918e-05, + "clip_ratio/low_min": 2.9339967113628518e-06, + "clip_ratio/region_mean": 2.8672255837136618e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16055.0, + "completions/mean_length": 7886.015625, + "completions/mean_terminated_length": 7682.064453125, + "completions/min_length": 989.0, + "completions/min_terminated_length": 989.0, + "entropy": 0.9391767829656601, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002552987542003393, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 120990289.0, + "reward": 0.328125, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 0.000899312668479979, + "sampling/sampling_logp_difference/max": 7.013879776000977, + "sampling/sampling_logp_difference/mean": 0.02049873024225235, + "step": 158 + }, + { + "clip_ratio/high_max": 3.406416203688423e-05, + "clip_ratio/high_mean": 9.72330332160709e-06, + "clip_ratio/low_mean": 3.168332909808669e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140663151019908e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 6173.1640625, + "completions/mean_terminated_length": 6011.087890625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.9148785546422005, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002678362652659416, + "learning_rate": 1e-05, + "loss": 0.039, + "num_tokens": 121797958.0, + "reward": 0.4140625, + "reward_std": 0.3608373999595642, + "rewards/accuracy_reward/mean": 0.4140625, + 
"rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999265074729919, + "sampling/importance_sampling_ratio/min": 0.002013920107856393, + "sampling/sampling_logp_difference/max": 6.207672119140625, + "sampling/sampling_logp_difference/mean": 0.018977735191583633, + "step": 159 + }, + { + "clip_ratio/high_max": 1.8476588593330234e-05, + "clip_ratio/high_mean": 4.6191471483325586e-06, + "clip_ratio/low_mean": 4.459614581264759e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9215293188353826e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 6594.21875, + "completions/mean_terminated_length": 6196.259765625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.9486038386821747, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033711253199726343, + "learning_rate": 1e-05, + "loss": 0.026, + "num_tokens": 122661170.0, + "reward": 0.3828125, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998981356620789, + "sampling/importance_sampling_ratio/min": 0.0002968419576063752, + "sampling/sampling_logp_difference/max": 8.122310638427734, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 160 + }, + { + "clip_ratio/high_max": 7.97335997049231e-06, + "clip_ratio/high_mean": 2.7343705824023345e-06, + "clip_ratio/low_mean": 5.420079878604156e-05, + "clip_ratio/low_min": 4.594068286678521e-06, + "clip_ratio/region_mean": 5.693517005056492e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15928.0, + 
"completions/mean_length": 6533.9453125, + "completions/mean_terminated_length": 6377.595703125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9986584335565567, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017857529455795884, + "learning_rate": 1e-05, + "loss": 0.0804, + "num_tokens": 123518107.0, + "reward": 0.34375, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998549818992615, + "sampling/importance_sampling_ratio/min": 9.012701411847956e-06, + "sampling/sampling_logp_difference/max": 11.616875648498535, + "sampling/sampling_logp_difference/mean": 0.02010391652584076, + "step": 161 + }, + { + "clip_ratio/high_max": 4.470512521947967e-06, + "clip_ratio/high_mean": 1.1176281304869917e-06, + "clip_ratio/low_mean": 3.5141094485879876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.625872295742738e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13212.0, + "completions/mean_length": 5742.21875, + "completions/mean_terminated_length": 5658.42529296875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0379670709371567, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018227624241262674, + "learning_rate": 1e-05, + "loss": -0.0237, + "num_tokens": 124279031.0, + "reward": 0.21875, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998506903648376, + "sampling/importance_sampling_ratio/min": 0.0020977305248379707, + "sampling/sampling_logp_difference/max": 
6.16689920425415, + "sampling/sampling_logp_difference/mean": 0.019987668842077255, + "step": 162 + }, + { + "clip_ratio/high_max": 1.0003542683989508e-05, + "clip_ratio/high_mean": 3.21091931709816e-06, + "clip_ratio/low_mean": 5.731009014198207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.0521009800140746e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7584.703125, + "completions/mean_terminated_length": 7515.41748046875, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "entropy": 0.953459307551384, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002219022251665592, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 125270761.0, + "reward": 0.359375, + "reward_std": 0.37033066153526306, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999880790710449, + "sampling/importance_sampling_ratio/min": 0.0024849213659763336, + "sampling/sampling_logp_difference/max": 5.997514247894287, + "sampling/sampling_logp_difference/mean": 0.020291510969400406, + "step": 163 + }, + { + "clip_ratio/high_max": 7.734669452474918e-06, + "clip_ratio/high_mean": 1.9336673631187296e-06, + "clip_ratio/low_mean": 3.1135301298945706e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3068968605221016e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 4714.671875, + "completions/mean_terminated_length": 4622.78759765625, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 1.018719919025898, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.0014189074281603098, + "learning_rate": 1e-05, + "loss": 0.0501, + "num_tokens": 125895279.0, + "reward": 0.3984375, + "reward_std": 0.28383445739746094, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479651451111, + "sampling/importance_sampling_ratio/min": 4.017410901724361e-07, + "sampling/sampling_logp_difference/max": 14.727458000183105, + "sampling/sampling_logp_difference/mean": 0.018739396706223488, + "step": 164 + }, + { + "clip_ratio/high_max": 1.0069575182569679e-05, + "clip_ratio/high_mean": 2.5173937956424197e-06, + "clip_ratio/low_mean": 3.824179225375701e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0759185367278405e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15913.0, + "completions/mean_length": 6316.140625, + "completions/mean_terminated_length": 6074.51220703125, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.9325072392821312, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001702460227534175, + "learning_rate": 1e-05, + "loss": 0.1007, + "num_tokens": 126722881.0, + "reward": 0.4609375, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999539852142334, + "sampling/importance_sampling_ratio/min": 0.0012551364488899708, + "sampling/sampling_logp_difference/max": 6.680510997772217, + "sampling/sampling_logp_difference/mean": 0.01929408684372902, + "step": 165 + }, + { + "clip_ratio/high_max": 6.873041002108948e-06, + "clip_ratio/high_mean": 1.718260250527237e-06, + "clip_ratio/low_mean": 3.119859468370123e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.291685527528898e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15832.0, + "completions/mean_length": 4687.140625, + "completions/mean_terminated_length": 4595.03955078125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 1.0886607319116592, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032931750174611807, + "learning_rate": 1e-05, + "loss": 0.0078, + "num_tokens": 127341715.0, + "reward": 0.28125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821186065674, + "sampling/importance_sampling_ratio/min": 0.0019364450126886368, + "sampling/sampling_logp_difference/max": 6.246901512145996, + "sampling/sampling_logp_difference/mean": 0.020621225237846375, + "step": 166 + }, + { + "clip_ratio/high_max": 1.773085250533768e-05, + "clip_ratio/high_mean": 4.43271312633442e-06, + "clip_ratio/low_mean": 4.30743207289197e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7507033741567284e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14125.0, + "completions/mean_length": 5705.515625, + "completions/mean_terminated_length": 5449.232421875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0523068830370903, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0031696646474301815, + "learning_rate": 1e-05, + "loss": -0.0414, + "num_tokens": 128093597.0, + "reward": 0.1953125, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999619126319885, + "sampling/importance_sampling_ratio/min": 3.197810656274669e-05, + "sampling/sampling_logp_difference/max": 10.350459098815918, + "sampling/sampling_logp_difference/mean": 0.021961934864521027, + "step": 167 + }, + { + "clip_ratio/high_max": 1.885905066956184e-05, + "clip_ratio/high_mean": 4.71476266739046e-06, + "clip_ratio/low_mean": 5.0530389898995054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.524515336219338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15958.0, + "completions/mean_length": 6214.4921875, + "completions/mean_terminated_length": 6053.07177734375, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.9371421113610268, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023704832419753075, + "learning_rate": 1e-05, + "loss": 0.075, + "num_tokens": 128906948.0, + "reward": 0.40625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.0003354824730195105, + "sampling/sampling_logp_difference/max": 7.999940872192383, + "sampling/sampling_logp_difference/mean": 0.01882763020694256, + "step": 168 + }, + { + "clip_ratio/high_max": 3.042072216885572e-05, + "clip_ratio/high_mean": 7.60518054221393e-06, + "clip_ratio/low_mean": 4.5897569179942366e-05, + "clip_ratio/low_min": 8.727477506909054e-06, + "clip_ratio/region_mean": 5.3502750233747065e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 7127.0703125, + "completions/mean_terminated_length": 7054.18115234375, + 
"completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.9854387491941452, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003370177699252963, + "learning_rate": 1e-05, + "loss": 0.1197, + "num_tokens": 129839813.0, + "reward": 0.359375, + "reward_std": 0.3329663574695587, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999907910823822, + "sampling/importance_sampling_ratio/min": 1.077816432371037e-05, + "sampling/sampling_logp_difference/max": 11.43798828125, + "sampling/sampling_logp_difference/mean": 0.019736800342798233, + "step": 169 + }, + { + "clip_ratio/high_max": 2.1401074718596647e-05, + "clip_ratio/high_mean": 6.243764005375851e-06, + "clip_ratio/low_mean": 3.2797592325550795e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.904135610355297e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 6566.2890625, + "completions/mean_terminated_length": 6330.6640625, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.7978609576821327, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026055986527353525, + "learning_rate": 1e-05, + "loss": 0.0661, + "num_tokens": 130698370.0, + "reward": 0.5, + "reward_std": 0.36295419931411743, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999133944511414, + "sampling/importance_sampling_ratio/min": 0.00031152591691352427, + "sampling/sampling_logp_difference/max": 8.074028015136719, + "sampling/sampling_logp_difference/mean": 0.01787097379565239, + "step": 170 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0564424403346493e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0564424403346493e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15576.0, + "completions/max_terminated_length": 15576.0, + "completions/mean_length": 7186.2890625, + "completions/mean_terminated_length": 7186.2890625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 1.0232757329940796, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0023866184055805206, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 131637439.0, + "reward": 0.2734375, + "reward_std": 0.2059282809495926, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999207258224487, + "sampling/importance_sampling_ratio/min": 0.0007378471200354397, + "sampling/sampling_logp_difference/max": 7.211773872375488, + "sampling/sampling_logp_difference/mean": 0.02137116715312004, + "step": 171 + }, + { + "clip_ratio/high_max": 4.037900725961663e-05, + "clip_ratio/high_mean": 1.0094751814904157e-05, + "clip_ratio/low_mean": 5.8380828136250784e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.847557995115494e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13638.0, + "completions/mean_length": 5591.5703125, + "completions/mean_terminated_length": 5420.26220703125, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9335208311676979, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003491115989163518, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 132371816.0, + "reward": 0.5, + "reward_std": 0.3406373858451843, + 
"rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999891459941864, + "sampling/importance_sampling_ratio/min": 0.00012356207298580557, + "sampling/sampling_logp_difference/max": 8.998766899108887, + "sampling/sampling_logp_difference/mean": 0.018760837614536285, + "step": 172 + }, + { + "clip_ratio/high_max": 2.8378776733006816e-06, + "clip_ratio/high_mean": 7.094694183251704e-07, + "clip_ratio/low_mean": 4.4085751369493664e-05, + "clip_ratio/low_min": 6.7955093072669115e-06, + "clip_ratio/region_mean": 4.4795220674132e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16302.0, + "completions/mean_length": 7152.3828125, + "completions/mean_terminated_length": 6930.82421875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.1329835206270218, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002830669516697526, + "learning_rate": 1e-05, + "loss": 0.0526, + "num_tokens": 133307297.0, + "reward": 0.28125, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999501705169678, + "sampling/importance_sampling_ratio/min": 0.00028047082014381886, + "sampling/sampling_logp_difference/max": 8.179040908813477, + "sampling/sampling_logp_difference/mean": 0.021548541262745857, + "step": 173 + }, + { + "clip_ratio/high_max": 1.0150829439226072e-05, + "clip_ratio/high_mean": 2.537707359806518e-06, + "clip_ratio/low_mean": 3.4009618616437365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.654732597624388e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15068.0, + "completions/mean_length": 7263.453125, + "completions/mean_terminated_length": 7118.68310546875, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.092760555446148, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0027821618132293224, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 134260107.0, + "reward": 0.3203125, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999946117401123, + "sampling/importance_sampling_ratio/min": 7.832317351130769e-05, + "sampling/sampling_logp_difference/max": 9.454667091369629, + "sampling/sampling_logp_difference/mean": 0.022098438814282417, + "step": 174 + }, + { + "clip_ratio/high_max": 1.0561876024439698e-05, + "clip_ratio/high_mean": 2.6404690061099245e-06, + "clip_ratio/low_mean": 1.6864279416495265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9504748649978865e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15388.0, + "completions/mean_length": 7088.8125, + "completions/mean_terminated_length": 6710.958984375, + "completions/min_length": 1314.0, + "completions/min_terminated_length": 1314.0, + "entropy": 1.0669445469975471, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0007076738984324038, + "learning_rate": 1e-05, + "loss": -0.0197, + "num_tokens": 135186139.0, + "reward": 0.328125, + "reward_std": 0.20593319833278656, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998199343681335, + "sampling/importance_sampling_ratio/min": 
3.084653872065246e-05, + "sampling/sampling_logp_difference/max": 10.386486053466797, + "sampling/sampling_logp_difference/mean": 0.020075790584087372, + "step": 175 + }, + { + "clip_ratio/high_max": 7.095016371749807e-06, + "clip_ratio/high_mean": 1.7737540929374518e-06, + "clip_ratio/low_mean": 2.7592465016823553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.936621888238733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15626.0, + "completions/max_terminated_length": 15626.0, + "completions/mean_length": 5352.734375, + "completions/mean_terminated_length": 5352.734375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 1.0387161895632744, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0022445612121373415, + "learning_rate": 1e-05, + "loss": 0.0261, + "num_tokens": 135888929.0, + "reward": 0.4765625, + "reward_std": 0.399257630109787, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054670333862, + "sampling/importance_sampling_ratio/min": 0.00032565294532105327, + "sampling/sampling_logp_difference/max": 8.029678344726562, + "sampling/sampling_logp_difference/mean": 0.02010166086256504, + "step": 176 + }, + { + "clip_ratio/high_max": 1.5100852124305675e-05, + "clip_ratio/high_mean": 4.426987970873597e-06, + "clip_ratio/low_mean": 2.7625993425317574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2052981168817496e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16266.0, + "completions/mean_length": 7758.90625, + "completions/mean_terminated_length": 7408.29248046875, + "completions/min_length": 742.0, + "completions/min_terminated_length": 742.0, + "entropy": 1.0648984238505363, + "epoch": 0.16283348666053357, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0022021254990249872, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 136901941.0, + "reward": 0.3671875, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858140945435, + "sampling/importance_sampling_ratio/min": 2.2461865967216e-07, + "sampling/sampling_logp_difference/max": 15.30886173248291, + "sampling/sampling_logp_difference/mean": 0.021426808089017868, + "step": 177 + }, + { + "clip_ratio/high_max": 2.5346608254039893e-05, + "clip_ratio/high_mean": 7.4063813144675805e-06, + "clip_ratio/low_mean": 2.2069365058996482e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9475746259777225e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 7036.953125, + "completions/mean_terminated_length": 6496.21484375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9684997871518135, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013461806811392307, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 137824623.0, + "reward": 0.34375, + "reward_std": 0.2546031177043915, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999944806098938, + "sampling/importance_sampling_ratio/min": 5.834372132085264e-05, + "sampling/sampling_logp_difference/max": 9.74915885925293, + "sampling/sampling_logp_difference/mean": 0.020304443314671516, + "step": 178 + }, + { + "clip_ratio/high_max": 1.3147734080121154e-05, + "clip_ratio/high_mean": 3.2869335200302885e-06, + "clip_ratio/low_mean": 4.841489999307669e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.170183294467279e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15500.0, + "completions/mean_length": 6114.1875, + "completions/mean_terminated_length": 5951.1748046875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.943072073161602, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132438588887453, + "learning_rate": 1e-05, + "loss": 0.0943, + "num_tokens": 138625247.0, + "reward": 0.40625, + "reward_std": 0.321650892496109, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999298453330994, + "sampling/importance_sampling_ratio/min": 0.0017275095451623201, + "sampling/sampling_logp_difference/max": 6.361074447631836, + "sampling/sampling_logp_difference/mean": 0.020084267482161522, + "step": 179 + }, + { + "clip_ratio/high_max": 1.7873157958092634e-05, + "clip_ratio/high_mean": 4.468289489523158e-06, + "clip_ratio/low_mean": 3.5252990301160025e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9721279790683184e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15050.0, + "completions/mean_length": 7618.875, + "completions/mean_terminated_length": 7034.53369140625, + "completions/min_length": 1030.0, + "completions/min_terminated_length": 1030.0, + "entropy": 0.9142575263977051, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026741649489849806, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 139619287.0, + "reward": 0.2890625, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 0.005949751473963261, + "sampling/sampling_logp_difference/max": 5.124405860900879, + "sampling/sampling_logp_difference/mean": 0.020061582326889038, + "step": 180 + }, + { + "clip_ratio/high_max": 1.0512151675357018e-05, + "clip_ratio/high_mean": 2.6280379188392544e-06, + "clip_ratio/low_mean": 4.5301517502593924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.792955542143318e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16106.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 5333.875, + "completions/mean_terminated_length": 5333.875, + "completions/min_length": 1109.0, + "completions/min_terminated_length": 1109.0, + "entropy": 0.8107482865452766, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027016003150492907, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 140318935.0, + "reward": 0.5703125, + "reward_std": 0.2556639611721039, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + "sampling/importance_sampling_ratio/min": 0.006856904830783606, + "sampling/sampling_logp_difference/max": 4.982499122619629, + "sampling/sampling_logp_difference/mean": 0.017069874331355095, + "step": 181 + }, + { + "clip_ratio/high_max": 1.85085939392593e-05, + "clip_ratio/high_mean": 5.24943533264377e-06, + "clip_ratio/low_mean": 5.6120721524166584e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.137015702734061e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16050.0, + "completions/mean_length": 7443.3046875, + "completions/mean_terminated_length": 7154.89501953125, + 
"completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 0.9224414080381393, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002655779244378209, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 141293534.0, + "reward": 0.234375, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999659061431885, + "sampling/importance_sampling_ratio/min": 0.00042018835665658116, + "sampling/sampling_logp_difference/max": 7.774807453155518, + "sampling/sampling_logp_difference/mean": 0.02006504125893116, + "step": 182 + }, + { + "clip_ratio/high_max": 1.494229445597739e-05, + "clip_ratio/high_mean": 3.7355736139943474e-06, + "clip_ratio/low_mean": 2.2748562741981004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6484136355975352e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15923.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 5646.6875, + "completions/mean_terminated_length": 5646.6875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8945339694619179, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016281780553981662, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 142037438.0, + "reward": 0.46875, + "reward_std": 0.17912296950817108, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030517578125, + "sampling/importance_sampling_ratio/min": 0.0005717006279155612, + "sampling/sampling_logp_difference/max": 7.46689510345459, + "sampling/sampling_logp_difference/mean": 0.019336247816681862, + "step": 183 + }, + { + 
"clip_ratio/high_max": 3.335990868436056e-05, + "clip_ratio/high_mean": 8.33997717109014e-06, + "clip_ratio/low_mean": 3.5050728683927446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339070608239126e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14142.0, + "completions/mean_length": 6384.640625, + "completions/mean_terminated_length": 5892.86865234375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.840093269944191, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002166559686884284, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 142873848.0, + "reward": 0.4765625, + "reward_std": 0.35506346821784973, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 4.785555574926548e-06, + "sampling/sampling_logp_difference/max": 12.249908447265625, + "sampling/sampling_logp_difference/mean": 0.018109092488884926, + "step": 184 + }, + { + "clip_ratio/high_max": 1.541105484648142e-05, + "clip_ratio/high_mean": 3.852763711620355e-06, + "clip_ratio/low_mean": 4.0552770769863855e-05, + "clip_ratio/low_min": 7.133888630050933e-06, + "clip_ratio/region_mean": 4.440553459517105e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14828.0, + "completions/mean_length": 5775.0, + "completions/mean_terminated_length": 5691.46435546875, + "completions/min_length": 1147.0, + "completions/min_terminated_length": 1147.0, + "entropy": 0.8915362879633904, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021932912059128284, + "learning_rate": 1e-05, + "loss": -0.0086, + "num_tokens": 143636152.0, + "reward": 
0.4375, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000008225440979, + "sampling/importance_sampling_ratio/min": 9.714113069492214e-09, + "sampling/sampling_logp_difference/max": 18.44968605041504, + "sampling/sampling_logp_difference/mean": 0.019278086721897125, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7509142171311396e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7509142171311396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6181.640625, + "completions/mean_terminated_length": 6019.69873046875, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "entropy": 1.0544511675834656, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022947140969336033, + "learning_rate": 1e-05, + "loss": 0.0242, + "num_tokens": 144447370.0, + "reward": 0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999147653579712, + "sampling/importance_sampling_ratio/min": 7.419757253046555e-08, + "sampling/sampling_logp_difference/max": 16.416534423828125, + "sampling/sampling_logp_difference/mean": 0.02050788700580597, + "step": 186 + }, + { + "clip_ratio/high_max": 1.5700999938417226e-05, + "clip_ratio/high_mean": 3.9252499846043065e-06, + "clip_ratio/low_mean": 2.4595847037289786e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8521096965050674e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15824.0, + "completions/mean_length": 6542.3046875, + "completions/mean_terminated_length": 6306.1044921875, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.933225467801094, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034910975955426693, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 145303505.0, + "reward": 0.390625, + "reward_std": 0.30433881282806396, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999945163726807, + "sampling/importance_sampling_ratio/min": 0.007213745731860399, + "sampling/sampling_logp_difference/max": 4.931766986846924, + "sampling/sampling_logp_difference/mean": 0.020022759214043617, + "step": 187 + }, + { + "clip_ratio/high_max": 6.0999414017715026e-06, + "clip_ratio/high_mean": 1.5249853504428756e-06, + "clip_ratio/low_mean": 2.61421698724007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7667155109156738e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 5889.4765625, + "completions/mean_terminated_length": 5637.6083984375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.9649673849344254, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024078311398625374, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 146082198.0, + "reward": 0.3359375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999341368675232, + "sampling/importance_sampling_ratio/min": 
0.0008680344326421618, + "sampling/sampling_logp_difference/max": 7.04927921295166, + "sampling/sampling_logp_difference/mean": 0.02060198038816452, + "step": 188 + }, + { + "clip_ratio/high_max": 7.789618393871933e-06, + "clip_ratio/high_mean": 1.9474045984679833e-06, + "clip_ratio/low_mean": 3.6395756637830345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.834316100892465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16233.0, + "completions/mean_length": 5349.2421875, + "completions/mean_terminated_length": 5084.408203125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8402756005525589, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021191861014813185, + "learning_rate": 1e-05, + "loss": 0.1275, + "num_tokens": 146786245.0, + "reward": 0.4765625, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999837875366211, + "sampling/importance_sampling_ratio/min": 3.763807762879878e-05, + "sampling/sampling_logp_difference/max": 10.187494277954102, + "sampling/sampling_logp_difference/mean": 0.017112664878368378, + "step": 189 + }, + { + "clip_ratio/high_max": 1.2461773394534248e-05, + "clip_ratio/high_mean": 3.115443348633562e-06, + "clip_ratio/low_mean": 5.095924211673264e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4074685294835945e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 7272.3203125, + "completions/mean_terminated_length": 7053.64013671875, + "completions/min_length": 1074.0, + "completions/min_terminated_length": 1074.0, + "entropy": 0.9627499282360077, + "epoch": 
0.17479300827966882, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022120666690170765, + "learning_rate": 1e-05, + "loss": 0.0079, + "num_tokens": 147737086.0, + "reward": 0.2890625, + "reward_std": 0.27304792404174805, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999538660049438, + "sampling/importance_sampling_ratio/min": 1.6960719221970066e-05, + "sampling/sampling_logp_difference/max": 10.984610557556152, + "sampling/sampling_logp_difference/mean": 0.0203307643532753, + "step": 190 + }, + { + "clip_ratio/high_max": 1.7891727566166082e-05, + "clip_ratio/high_mean": 4.472931891541521e-06, + "clip_ratio/low_mean": 5.616715043288423e-05, + "clip_ratio/low_min": 7.80031223257538e-06, + "clip_ratio/region_mean": 6.064008221073891e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 6387.1875, + "completions/mean_terminated_length": 5895.54052734375, + "completions/min_length": 1310.0, + "completions/min_terminated_length": 1310.0, + "entropy": 0.9110158830881119, + "epoch": 0.17571297148114076, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030851473566144705, + "learning_rate": 1e-05, + "loss": 0.1091, + "num_tokens": 148573782.0, + "reward": 0.40625, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 0.003961040172725916, + "sampling/sampling_logp_difference/max": 5.531248569488525, + "sampling/sampling_logp_difference/mean": 0.018049638718366623, + "step": 191 + }, + { + "clip_ratio/high_max": 1.6994396901282016e-05, + "clip_ratio/high_mean": 5.400205964178895e-06, + 
"clip_ratio/low_mean": 3.274822392995702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8148429439388565e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 7267.59375, + "completions/mean_terminated_length": 7195.81103515625, + "completions/min_length": 653.0, + "completions/min_terminated_length": 653.0, + "entropy": 0.9254888147115707, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020694085396826267, + "learning_rate": 1e-05, + "loss": 0.0462, + "num_tokens": 149521258.0, + "reward": 0.2734375, + "reward_std": 0.29719972610473633, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054670333862, + "sampling/importance_sampling_ratio/min": 7.411616934405174e-06, + "sampling/sampling_logp_difference/max": 11.812461853027344, + "sampling/sampling_logp_difference/mean": 0.01898832805454731, + "step": 192 + }, + { + "clip_ratio/high_max": 4.10414668294834e-06, + "clip_ratio/high_mean": 1.026036670737085e-06, + "clip_ratio/low_mean": 4.7441100377909606e-05, + "clip_ratio/low_min": 4.552241534838686e-06, + "clip_ratio/region_mean": 4.8467136821273016e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16076.0, + "completions/mean_length": 7100.1953125, + "completions/mean_terminated_length": 6952.83349609375, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "entropy": 0.8455610796809196, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003085972974076867, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 150447923.0, + "reward": 0.25, + "reward_std": 0.23645778000354767, + "rewards/accuracy_reward/mean": 0.25, + 
"rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999178647994995, + "sampling/importance_sampling_ratio/min": 0.0011708807433024049, + "sampling/sampling_logp_difference/max": 6.749999046325684, + "sampling/sampling_logp_difference/mean": 0.01974140852689743, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.6514521121280268e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6514521121280268e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15535.0, + "completions/mean_length": 6626.4296875, + "completions/mean_terminated_length": 6549.5986328125, + "completions/min_length": 1746.0, + "completions/min_terminated_length": 1746.0, + "entropy": 1.0323699787259102, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003505800850689411, + "learning_rate": 1e-05, + "loss": 0.0885, + "num_tokens": 151313834.0, + "reward": 0.390625, + "reward_std": 0.17176413536071777, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381303787231, + "sampling/importance_sampling_ratio/min": 2.8102756914449856e-05, + "sampling/sampling_logp_difference/max": 10.479642868041992, + "sampling/sampling_logp_difference/mean": 0.021082937717437744, + "step": 194 + }, + { + "clip_ratio/high_max": 2.006086378969485e-05, + "clip_ratio/high_mean": 5.890002398700744e-06, + "clip_ratio/low_mean": 3.503898199141986e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.092898473118112e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15595.0, + "completions/mean_length": 7093.109375, + 
"completions/mean_terminated_length": 6870.12841796875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 1.0206764563918114, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002495395252481103, + "learning_rate": 1e-05, + "loss": 0.0308, + "num_tokens": 152238192.0, + "reward": 0.2890625, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999728798866272, + "sampling/importance_sampling_ratio/min": 9.536534344078973e-05, + "sampling/sampling_logp_difference/max": 9.257795333862305, + "sampling/sampling_logp_difference/mean": 0.020610272884368896, + "step": 195 + }, + { + "clip_ratio/high_max": 3.2352409107261337e-06, + "clip_ratio/high_mean": 8.088102276815334e-07, + "clip_ratio/low_mean": 4.056704699451075e-05, + "clip_ratio/low_min": 1.1648833606159315e-05, + "clip_ratio/region_mean": 4.1375856994818605e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14191.0, + "completions/mean_length": 6795.71875, + "completions/mean_terminated_length": 6486.4189453125, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "entropy": 0.8927837759256363, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014066790463402867, + "learning_rate": 1e-05, + "loss": -0.0031, + "num_tokens": 153131828.0, + "reward": 0.3359375, + "reward_std": 0.2409384548664093, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 5.093755135021638e-06, + "sampling/sampling_logp_difference/max": 12.187495231628418, + 
"sampling/sampling_logp_difference/mean": 0.01874586008489132, + "step": 196 + }, + { + "clip_ratio/high_max": 1.5244630048982799e-05, + "clip_ratio/high_mean": 3.8111575122456998e-06, + "clip_ratio/low_mean": 3.655197178886738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.03631290737394e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15831.0, + "completions/mean_length": 7075.1015625, + "completions/mean_terminated_length": 6617.28662109375, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 0.8989318311214447, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017937121447175741, + "learning_rate": 1e-05, + "loss": 0.0359, + "num_tokens": 154057097.0, + "reward": 0.3984375, + "reward_std": 0.23068872094154358, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998950958251953, + "sampling/importance_sampling_ratio/min": 0.00021659507183358073, + "sampling/sampling_logp_difference/max": 8.437480926513672, + "sampling/sampling_logp_difference/mean": 0.01890135183930397, + "step": 197 + }, + { + "clip_ratio/high_max": 1.4074375030759256e-05, + "clip_ratio/high_mean": 4.977033995601232e-06, + "clip_ratio/low_mean": 3.2670792506905855e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.764782627513341e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14100.0, + "completions/mean_length": 7120.0, + "completions/mean_terminated_length": 6743.41455078125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.8758384585380554, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003410576842725277, + 
"learning_rate": 1e-05, + "loss": 0.0536, + "num_tokens": 154988585.0, + "reward": 0.3984375, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999953508377075, + "sampling/importance_sampling_ratio/min": 0.003589102067053318, + "sampling/sampling_logp_difference/max": 5.629853248596191, + "sampling/sampling_logp_difference/mean": 0.018400676548480988, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.977112736994968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.977112736994968e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 6590.6796875, + "completions/mean_terminated_length": 6513.56689453125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9243742749094963, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003304310142993927, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 155851000.0, + "reward": 0.3984375, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999579787254333, + "sampling/importance_sampling_ratio/min": 1.2693599273916334e-06, + "sampling/sampling_logp_difference/max": 13.576997756958008, + "sampling/sampling_logp_difference/mean": 0.01959652081131935, + "step": 199 + }, + { + "clip_ratio/high_max": 1.1435367014200892e-05, + "clip_ratio/high_mean": 2.858841753550223e-06, + "clip_ratio/low_mean": 4.7742656533955596e-05, + "clip_ratio/low_min": 8.646529749967158e-06, + "clip_ratio/region_mean": 5.0601498060132144e-05, 
+ "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 6999.484375, + "completions/mean_terminated_length": 6696.7578125, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.843244343996048, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023830258287489414, + "learning_rate": 1e-05, + "loss": 0.1142, + "num_tokens": 156766782.0, + "reward": 0.359375, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998635053634644, + "sampling/importance_sampling_ratio/min": 0.00014761318743694574, + "sampling/sampling_logp_difference/max": 8.820915222167969, + "sampling/sampling_logp_difference/mean": 0.018434934318065643, + "step": 200 + }, + { + "clip_ratio/high_max": 2.5114631171163637e-05, + "clip_ratio/high_mean": 7.040741365926806e-06, + "clip_ratio/low_mean": 5.3607667723554187e-05, + "clip_ratio/low_min": 9.219345429301029e-06, + "clip_ratio/region_mean": 6.064840863473364e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14986.0, + "completions/mean_length": 6407.5, + "completions/mean_terminated_length": 6249.14306640625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.9549195989966393, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024427250027656555, + "learning_rate": 1e-05, + "loss": 0.0795, + "num_tokens": 157606126.0, + "reward": 0.3515625, + "reward_std": 0.32879000902175903, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 0.0002305622911080718, + "sampling/sampling_logp_difference/max": 8.37498950958252, + "sampling/sampling_logp_difference/mean": 0.0192743968218565, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.928529067958152e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.928529067958152e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15519.0, + "completions/mean_length": 6638.390625, + "completions/mean_terminated_length": 5901.328125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.9070822075009346, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002024515997618437, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 158474248.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999830722808838, + "sampling/importance_sampling_ratio/min": 0.0036068728659301996, + "sampling/sampling_logp_difference/max": 5.624914169311523, + "sampling/sampling_logp_difference/mean": 0.01955476775765419, + "step": 202 + }, + { + "clip_ratio/high_max": 8.365173471247545e-06, + "clip_ratio/high_mean": 2.091293367811886e-06, + "clip_ratio/low_mean": 4.1470637825113954e-05, + "clip_ratio/low_min": 4.027710474474588e-06, + "clip_ratio/region_mean": 4.356193130661268e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 7324.546875, + "completions/mean_terminated_length": 6878.99951171875, + "completions/min_length": 571.0, + 
"completions/min_terminated_length": 571.0, + "entropy": 0.9108889549970627, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022787705529481173, + "learning_rate": 1e-05, + "loss": 0.0616, + "num_tokens": 159434350.0, + "reward": 0.3359375, + "reward_std": 0.26515230536460876, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999351501464844, + "sampling/importance_sampling_ratio/min": 0.03948089852929115, + "sampling/sampling_logp_difference/max": 3.231938362121582, + "sampling/sampling_logp_difference/mean": 0.019122496247291565, + "step": 203 + }, + { + "clip_ratio/high_max": 8.65733409227687e-06, + "clip_ratio/high_mean": 2.1643335230692173e-06, + "clip_ratio/low_mean": 3.456336048657249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.672769389595487e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13983.0, + "completions/mean_length": 5520.4453125, + "completions/mean_terminated_length": 5434.9052734375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.8982062339782715, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026195270475000143, + "learning_rate": 1e-05, + "loss": 0.049, + "num_tokens": 160163055.0, + "reward": 0.4375, + "reward_std": 0.24831004440784454, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 0.0005541297141462564, + "sampling/sampling_logp_difference/max": 7.498111724853516, + "sampling/sampling_logp_difference/mean": 0.019064132124185562, + "step": 204 + }, + { + "clip_ratio/high_max": 
1.8376186289970065e-05, + "clip_ratio/high_mean": 6.650576210631698e-06, + "clip_ratio/low_mean": 4.059042771586974e-05, + "clip_ratio/low_min": 5.350111223378917e-06, + "clip_ratio/region_mean": 4.724100449493562e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15267.0, + "completions/max_terminated_length": 15267.0, + "completions/mean_length": 6846.515625, + "completions/mean_terminated_length": 6846.515625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9657742157578468, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0014831912703812122, + "learning_rate": 1e-05, + "loss": 0.006, + "num_tokens": 161057657.0, + "reward": 0.296875, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999252557754517, + "sampling/importance_sampling_ratio/min": 6.252834282349795e-05, + "sampling/sampling_logp_difference/max": 9.679890632629395, + "sampling/sampling_logp_difference/mean": 0.020372584462165833, + "step": 205 + }, + { + "clip_ratio/high_max": 1.658901419432368e-05, + "clip_ratio/high_mean": 4.14725354858092e-06, + "clip_ratio/low_mean": 4.473214539757464e-05, + "clip_ratio/low_min": 2.9674999950657366e-06, + "clip_ratio/region_mean": 4.887939894615556e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16370.0, + "completions/mean_length": 6946.8984375, + "completions/mean_terminated_length": 6642.4755859375, + "completions/min_length": 1133.0, + "completions/min_terminated_length": 1133.0, + "entropy": 0.8490508273243904, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017962189158424735, + "learning_rate": 1e-05, + "loss": 0.0696, + "num_tokens": 161966356.0, + "reward": 0.4296875, + 
"reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 7.035569433355704e-05, + "sampling/sampling_logp_difference/max": 9.561946868896484, + "sampling/sampling_logp_difference/mean": 0.019146796315908432, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.22491199540309e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.22491199540309e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15123.0, + "completions/mean_length": 6618.9765625, + "completions/mean_terminated_length": 6463.9765625, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.9541772454977036, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017619321588426828, + "learning_rate": 1e-05, + "loss": 0.0509, + "num_tokens": 162836705.0, + "reward": 0.390625, + "reward_std": 0.2130674123764038, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999436140060425, + "sampling/importance_sampling_ratio/min": 4.2106199771296815e-07, + "sampling/sampling_logp_difference/max": 14.680485725402832, + "sampling/sampling_logp_difference/mean": 0.020236656069755554, + "step": 207 + }, + { + "clip_ratio/high_max": 1.6846054222696694e-05, + "clip_ratio/high_mean": 4.211513555674173e-06, + "clip_ratio/low_mean": 3.877300162002939e-05, + "clip_ratio/low_min": 4.230834292684449e-06, + "clip_ratio/region_mean": 4.298451551676408e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 12469.0, + "completions/mean_length": 5485.71875, + "completions/mean_terminated_length": 5312.73046875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.8888534903526306, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002670915797352791, + "learning_rate": 1e-05, + "loss": 0.0709, + "num_tokens": 163558197.0, + "reward": 0.46875, + "reward_std": 0.3145885467529297, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000442266464233, + "sampling/importance_sampling_ratio/min": 0.0005042250850237906, + "sampling/sampling_logp_difference/max": 7.592487812042236, + "sampling/sampling_logp_difference/mean": 0.019581373780965805, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6889288480779214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6889288480779214e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16184.0, + "completions/mean_length": 4345.171875, + "completions/mean_terminated_length": 4250.3779296875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.8308270424604416, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004005427472293377, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 164133499.0, + "reward": 0.578125, + "reward_std": 0.31642353534698486, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999247193336487, + "sampling/importance_sampling_ratio/min": 0.022981969639658928, + "sampling/sampling_logp_difference/max": 
3.773045301437378, + "sampling/sampling_logp_difference/mean": 0.017508968710899353, + "step": 209 + }, + { + "clip_ratio/high_max": 1.2997116300539346e-05, + "clip_ratio/high_mean": 3.2492790751348366e-06, + "clip_ratio/low_mean": 2.723402121773688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0483300406558556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5227.296875, + "completions/mean_terminated_length": 5050.20654296875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 0.9231975972652435, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0031033784616738558, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 164823681.0, + "reward": 0.4765625, + "reward_std": 0.29249146580696106, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999896764755249, + "sampling/importance_sampling_ratio/min": 0.0021342060063034296, + "sampling/sampling_logp_difference/max": 6.149660587310791, + "sampling/sampling_logp_difference/mean": 0.019171088933944702, + "step": 210 + }, + { + "clip_ratio/high_max": 2.0835890609305352e-05, + "clip_ratio/high_mean": 5.208972652326338e-06, + "clip_ratio/low_mean": 2.9314877565411734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.452385044511175e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14160.0, + "completions/mean_length": 6473.4765625, + "completions/mean_terminated_length": 6316.1669921875, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 0.9061874598264694, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.003495733719319105, + "learning_rate": 1e-05, + "loss": 0.0785, + "num_tokens": 165668798.0, + "reward": 0.4765625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000354051589966, + "sampling/importance_sampling_ratio/min": 0.0004697878030128777, + "sampling/sampling_logp_difference/max": 7.663229465484619, + "sampling/sampling_logp_difference/mean": 0.018978482112288475, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.991967162164656e-05, + "clip_ratio/low_min": 6.304534053924726e-06, + "clip_ratio/region_mean": 3.991967162164656e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14659.0, + "completions/mean_length": 7140.1953125, + "completions/mean_terminated_length": 6605.4296875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.9605444446206093, + "epoch": 0.19503219871205152, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002381941769272089, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 166603375.0, + "reward": 0.3046875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 0.00043123820796608925, + "sampling/sampling_logp_difference/max": 7.748849868774414, + "sampling/sampling_logp_difference/mean": 0.021141134202480316, + "step": 212 + }, + { + "clip_ratio/high_max": 1.4948576790629886e-05, + "clip_ratio/high_mean": 3.7371441976574715e-06, + "clip_ratio/low_mean": 3.4953729482367635e-05, + "clip_ratio/low_min": 3.991060111729894e-06, + 
"clip_ratio/region_mean": 3.869087413477246e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13770.0, + "completions/mean_length": 5304.46875, + "completions/mean_terminated_length": 5038.56005859375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.9176690131425858, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040566748939454556, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 167302275.0, + "reward": 0.4296875, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999827742576599, + "sampling/importance_sampling_ratio/min": 5.001809313398553e-07, + "sampling/sampling_logp_difference/max": 14.508296012878418, + "sampling/sampling_logp_difference/mean": 0.018822530284523964, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.653866999935417e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.653866999935417e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15791.0, + "completions/mean_length": 5796.5, + "completions/mean_terminated_length": 5542.400390625, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.9230027198791504, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021502040326595306, + "learning_rate": 1e-05, + "loss": 0.0737, + "num_tokens": 168063627.0, + "reward": 0.3828125, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999223351478577, + "sampling/importance_sampling_ratio/min": 0.009504453279078007, + "sampling/sampling_logp_difference/max": 4.655994892120361, + "sampling/sampling_logp_difference/mean": 0.01985779032111168, + "step": 214 + }, + { + "clip_ratio/high_max": 1.0863841453101486e-05, + "clip_ratio/high_mean": 2.7159603632753715e-06, + "clip_ratio/low_mean": 2.4175752741939505e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6891713218901714e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14814.0, + "completions/mean_length": 6135.4921875, + "completions/mean_terminated_length": 6054.79541015625, + "completions/min_length": 1259.0, + "completions/min_terminated_length": 1259.0, + "entropy": 0.869445689022541, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027786416467279196, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 168867858.0, + "reward": 0.4609375, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999550580978394, + "sampling/importance_sampling_ratio/min": 2.6089865059475414e-05, + "sampling/sampling_logp_difference/max": 10.553963661193848, + "sampling/sampling_logp_difference/mean": 0.018514130264520645, + "step": 215 + }, + { + "clip_ratio/high_max": 4.36788013757905e-06, + "clip_ratio/high_mean": 1.0919700343947625e-06, + "clip_ratio/low_mean": 1.993327998661698e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0852980330564606e-06, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15691.0, + "completions/mean_length": 6268.2421875, + "completions/mean_terminated_length": 6025.46435546875, + "completions/min_length": 
627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.951081782579422, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0007328780484385788, + "learning_rate": 1e-05, + "loss": 0.0188, + "num_tokens": 169689969.0, + "reward": 0.3828125, + "reward_std": 0.10994865000247955, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000133514404297, + "sampling/importance_sampling_ratio/min": 1.6650999896228313e-05, + "sampling/sampling_logp_difference/max": 11.003040313720703, + "sampling/sampling_logp_difference/mean": 0.02005261555314064, + "step": 216 + }, + { + "clip_ratio/high_max": 2.131336282218399e-05, + "clip_ratio/high_mean": 5.3283407055459975e-06, + "clip_ratio/low_mean": 3.5254403428552905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.058274430462916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13861.0, + "completions/mean_length": 5440.8984375, + "completions/mean_terminated_length": 5354.732421875, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 0.8271932750940323, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034721922129392624, + "learning_rate": 1e-05, + "loss": -0.0245, + "num_tokens": 170409292.0, + "reward": 0.53125, + "reward_std": 0.30327308177948, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998912811279297, + "sampling/importance_sampling_ratio/min": 1.8372484191786498e-05, + "sampling/sampling_logp_difference/max": 10.904656410217285, + "sampling/sampling_logp_difference/mean": 0.019136395305395126, + "step": 217 + }, + { + 
"clip_ratio/high_max": 1.2339016848272877e-05, + "clip_ratio/high_mean": 4.13687178024702e-06, + "clip_ratio/low_mean": 2.156280152121326e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.569967330146028e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15086.0, + "completions/mean_length": 6671.046875, + "completions/mean_terminated_length": 6594.56689453125, + "completions/min_length": 748.0, + "completions/min_terminated_length": 748.0, + "entropy": 0.9659745842218399, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027575206477195024, + "learning_rate": 1e-05, + "loss": 0.0286, + "num_tokens": 171280714.0, + "reward": 0.375, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411702156067, + "sampling/importance_sampling_ratio/min": 1.5700872609158978e-05, + "sampling/sampling_logp_difference/max": 11.06179428100586, + "sampling/sampling_logp_difference/mean": 0.019089506939053535, + "step": 218 + }, + { + "clip_ratio/high_max": 1.4603458112105727e-05, + "clip_ratio/high_mean": 3.650864528026432e-06, + "clip_ratio/low_mean": 3.2977761520669446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.662862599185246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15752.0, + "completions/mean_length": 7781.5546875, + "completions/mean_terminated_length": 7504.05615234375, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 1.1691131889820099, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012711051385849714, + "learning_rate": 1e-05, + "loss": 0.0115, + "num_tokens": 172302489.0, + "reward": 0.109375, + 
"reward_std": 0.1751839816570282, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998820424079895, + "sampling/importance_sampling_ratio/min": 0.005086081102490425, + "sampling/sampling_logp_difference/max": 5.281247615814209, + "sampling/sampling_logp_difference/mean": 0.023309212177991867, + "step": 219 + }, + { + "clip_ratio/high_max": 6.842087486802484e-06, + "clip_ratio/high_mean": 1.710521871700621e-06, + "clip_ratio/low_mean": 4.5269940528669395e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6980462457213434e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14891.0, + "completions/mean_length": 6489.96875, + "completions/mean_terminated_length": 6332.9208984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9354017227888107, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016933141741901636, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 173149653.0, + "reward": 0.484375, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 0.008998609147965908, + "sampling/sampling_logp_difference/max": 4.7106852531433105, + "sampling/sampling_logp_difference/mean": 0.019165027886629105, + "step": 220 + }, + { + "clip_ratio/high_max": 2.444740721330163e-05, + "clip_ratio/high_mean": 6.111851803325408e-06, + "clip_ratio/low_mean": 3.0998270403870265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.711012095664046e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14943.0, + 
"completions/max_terminated_length": 14943.0, + "completions/mean_length": 6309.75, + "completions/mean_terminated_length": 6309.75, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 1.012483686208725, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024940327275544405, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 173976797.0, + "reward": 0.4375, + "reward_std": 0.2790592610836029, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 0.0018720829393714666, + "sampling/sampling_logp_difference/max": 6.280703544616699, + "sampling/sampling_logp_difference/mean": 0.020797956734895706, + "step": 221 + }, + { + "clip_ratio/high_max": 1.1112337460872368e-05, + "clip_ratio/high_mean": 3.5388877677178243e-06, + "clip_ratio/low_mean": 1.7024583712554886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.056347148027271e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16362.0, + "completions/mean_length": 7574.984375, + "completions/mean_terminated_length": 7363.568359375, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9144782647490501, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002748408354818821, + "learning_rate": 1e-05, + "loss": 0.0588, + "num_tokens": 174965259.0, + "reward": 0.2734375, + "reward_std": 0.25224411487579346, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000108480453491, + "sampling/importance_sampling_ratio/min": 0.005681300535798073, + 
"sampling/sampling_logp_difference/max": 5.170575141906738, + "sampling/sampling_logp_difference/mean": 0.019229793921113014, + "step": 222 + }, + { + "clip_ratio/high_max": 1.4946090004741563e-05, + "clip_ratio/high_mean": 3.736522501185391e-06, + "clip_ratio/low_mean": 3.722507381098694e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.096159636901575e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6962.7734375, + "completions/mean_terminated_length": 6499.43408203125, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9248140156269073, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020343128126114607, + "learning_rate": 1e-05, + "loss": 0.0714, + "num_tokens": 175876446.0, + "reward": 0.421875, + "reward_std": 0.3156445026397705, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 0.0001609467581147328, + "sampling/sampling_logp_difference/max": 8.734436988830566, + "sampling/sampling_logp_difference/mean": 0.01860032044351101, + "step": 223 + }, + { + "clip_ratio/high_max": 4.226114015182247e-06, + "clip_ratio/high_mean": 1.0565285037955618e-06, + "clip_ratio/low_mean": 3.189400638348161e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.295053488727717e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14978.0, + "completions/mean_length": 6422.28125, + "completions/mean_terminated_length": 6264.1591796875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.7786787301301956, + "epoch": 0.20607175712971482, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0029119597747921944, + "learning_rate": 1e-05, + "loss": 0.1116, + "num_tokens": 176717226.0, + "reward": 0.578125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918937683105, + "sampling/importance_sampling_ratio/min": 0.0006287595024332404, + "sampling/sampling_logp_difference/max": 7.371761798858643, + "sampling/sampling_logp_difference/mean": 0.01786171644926071, + "step": 224 + }, + { + "clip_ratio/high_max": 5.4112551879370585e-06, + "clip_ratio/high_mean": 1.3528137969842646e-06, + "clip_ratio/low_mean": 2.103693077515345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2389744572137715e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16030.0, + "completions/mean_length": 6662.65625, + "completions/mean_terminated_length": 6508.349609375, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9501350447535515, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0027519147843122482, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 177586766.0, + "reward": 0.421875, + "reward_std": 0.21382881700992584, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000051259994507, + "sampling/importance_sampling_ratio/min": 2.507045428501442e-05, + "sampling/sampling_logp_difference/max": 10.593820571899414, + "sampling/sampling_logp_difference/mean": 0.020679686218500137, + "step": 225 + }, + { + "clip_ratio/high_max": 3.2487785119883483e-06, + "clip_ratio/high_mean": 8.121946279970871e-07, + "clip_ratio/low_mean": 
5.783435085504607e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8646545539886574e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15419.0, + "completions/mean_length": 6546.171875, + "completions/mean_terminated_length": 6146.259765625, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "entropy": 0.9217342138290405, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017936143558472395, + "learning_rate": 1e-05, + "loss": 0.0748, + "num_tokens": 178444556.0, + "reward": 0.3984375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000327825546265, + "sampling/importance_sampling_ratio/min": 8.447741129202768e-05, + "sampling/sampling_logp_difference/max": 9.379026412963867, + "sampling/sampling_logp_difference/mean": 0.019764548167586327, + "step": 226 + }, + { + "clip_ratio/high_max": 2.1980493102091714e-05, + "clip_ratio/high_mean": 5.4951232755229285e-06, + "clip_ratio/low_mean": 4.3977801396977156e-05, + "clip_ratio/low_min": 7.912247156127705e-06, + "clip_ratio/region_mean": 4.947292427459615e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15707.0, + "completions/max_terminated_length": 15707.0, + "completions/mean_length": 6433.9296875, + "completions/mean_terminated_length": 6433.9296875, + "completions/min_length": 731.0, + "completions/min_terminated_length": 731.0, + "entropy": 0.9361409991979599, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031324021983891726, + "learning_rate": 1e-05, + "loss": 0.0505, + "num_tokens": 179288499.0, + "reward": 0.453125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 
0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999454021453857, + "sampling/importance_sampling_ratio/min": 0.00018488657951820642, + "sampling/sampling_logp_difference/max": 8.595767974853516, + "sampling/sampling_logp_difference/mean": 0.019691072404384613, + "step": 227 + }, + { + "clip_ratio/high_max": 1.299416817346355e-05, + "clip_ratio/high_mean": 3.2485420433658874e-06, + "clip_ratio/low_mean": 3.756406420052372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.081260635757644e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15787.0, + "completions/mean_length": 6037.75, + "completions/mean_terminated_length": 5873.52392578125, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "entropy": 0.8700985535979271, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024714914616197348, + "learning_rate": 1e-05, + "loss": 0.0044, + "num_tokens": 180079619.0, + "reward": 0.484375, + "reward_std": 0.21436560153961182, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999628067016602, + "sampling/importance_sampling_ratio/min": 8.4841696661897e-05, + "sampling/sampling_logp_difference/max": 9.374723434448242, + "sampling/sampling_logp_difference/mean": 0.018519341945648193, + "step": 228 + }, + { + "clip_ratio/high_max": 7.293307589861797e-06, + "clip_ratio/high_mean": 1.8233268974654493e-06, + "clip_ratio/low_mean": 2.2305866423266707e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.412919320704532e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12264.0, + "completions/max_terminated_length": 12264.0, + "completions/mean_length": 5305.828125, + 
"completions/mean_terminated_length": 5305.828125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 1.1309608668088913, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003593914210796356, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 180780877.0, + "reward": 0.3984375, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011920928955, + "sampling/importance_sampling_ratio/min": 0.009941472671926022, + "sampling/sampling_logp_difference/max": 4.611040115356445, + "sampling/sampling_logp_difference/mean": 0.020471621304750443, + "step": 229 + }, + { + "clip_ratio/high_max": 2.0163415001661633e-05, + "clip_ratio/high_mean": 5.040853750415408e-06, + "clip_ratio/low_mean": 4.4980357415624894e-05, + "clip_ratio/low_min": 1.0012816346716136e-05, + "clip_ratio/region_mean": 5.0021211109196884e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13814.0, + "completions/mean_length": 6022.96875, + "completions/mean_terminated_length": 5774.30419921875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8560900762677193, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029816587921231985, + "learning_rate": 1e-05, + "loss": 0.0913, + "num_tokens": 181571465.0, + "reward": 0.515625, + "reward_std": 0.41504397988319397, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 1.5958334188326262e-05, + "sampling/sampling_logp_difference/max": 11.04552936553955, + 
"sampling/sampling_logp_difference/mean": 0.0181986466050148, + "step": 230 + }, + { + "clip_ratio/high_max": 1.8430865566188004e-05, + "clip_ratio/high_mean": 6.177042905619601e-06, + "clip_ratio/low_mean": 4.450247388376738e-05, + "clip_ratio/low_min": 4.840271230932558e-06, + "clip_ratio/region_mean": 5.067951724413433e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15130.0, + "completions/max_terminated_length": 15130.0, + "completions/mean_length": 6647.71875, + "completions/mean_terminated_length": 6647.71875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.9455481320619583, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0031632622703909874, + "learning_rate": 1e-05, + "loss": 0.1317, + "num_tokens": 182440957.0, + "reward": 0.3828125, + "reward_std": 0.39902517199516296, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000306367874146, + "sampling/importance_sampling_ratio/min": 1.4739508515049238e-05, + "sampling/sampling_logp_difference/max": 11.124979019165039, + "sampling/sampling_logp_difference/mean": 0.01906408555805683, + "step": 231 + }, + { + "clip_ratio/high_max": 2.2937053017813014e-05, + "clip_ratio/high_mean": 5.7342632544532535e-06, + "clip_ratio/low_mean": 6.042617155799235e-05, + "clip_ratio/low_min": 1.1000354334100848e-05, + "clip_ratio/region_mean": 6.616043401663774e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15988.0, + "completions/mean_length": 6809.1640625, + "completions/mean_terminated_length": 6500.29833984375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 1.050546184182167, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.00162694591563195, + "learning_rate": 1e-05, + "loss": 0.0346, + "num_tokens": 183332242.0, + "reward": 0.421875, + "reward_std": 0.33616161346435547, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000290870666504, + "sampling/importance_sampling_ratio/min": 4.244970114086755e-06, + "sampling/sampling_logp_difference/max": 12.369775772094727, + "sampling/sampling_logp_difference/mean": 0.021866722032427788, + "step": 232 + }, + { + "clip_ratio/high_max": 1.4678411844215589e-05, + "clip_ratio/high_mean": 3.669602961053897e-06, + "clip_ratio/low_mean": 2.4373607971028832e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8043211159456405e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6815.5, + "completions/mean_terminated_length": 6506.83837890625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 1.060033954679966, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024887355975806713, + "learning_rate": 1e-05, + "loss": 0.1059, + "num_tokens": 184225138.0, + "reward": 0.328125, + "reward_std": 0.2869548499584198, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999393820762634, + "sampling/importance_sampling_ratio/min": 0.00012930770753882825, + "sampling/sampling_logp_difference/max": 8.953315734863281, + "sampling/sampling_logp_difference/mean": 0.02019432932138443, + "step": 233 + }, + { + "clip_ratio/high_max": 7.910891326901037e-06, + "clip_ratio/high_mean": 1.9777228317252593e-06, + "clip_ratio/low_mean": 3.8802519611635944e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.078024221598753e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15838.0, + "completions/mean_length": 6928.4453125, + "completions/mean_terminated_length": 6623.42724609375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "entropy": 0.9051575735211372, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002783838426694274, + "learning_rate": 1e-05, + "loss": 0.0624, + "num_tokens": 185136323.0, + "reward": 0.3359375, + "reward_std": 0.25460803508758545, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999524354934692, + "sampling/importance_sampling_ratio/min": 1.0146355634788051e-05, + "sampling/sampling_logp_difference/max": 11.498395919799805, + "sampling/sampling_logp_difference/mean": 0.01905050128698349, + "step": 234 + }, + { + "clip_ratio/high_max": 4.399394583742833e-06, + "clip_ratio/high_mean": 1.0998486459357082e-06, + "clip_ratio/low_mean": 1.733424267058581e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8434091430208355e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14703.0, + "completions/mean_length": 7155.1328125, + "completions/mean_terminated_length": 7082.46435546875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 1.0119014978408813, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002105508930981159, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 186071324.0, + "reward": 0.328125, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999904990196228, + "sampling/importance_sampling_ratio/min": 0.003494206117466092, + "sampling/sampling_logp_difference/max": 5.656649112701416, + "sampling/sampling_logp_difference/mean": 0.020860780030488968, + "step": 235 + }, + { + "clip_ratio/high_max": 1.0561529961705673e-05, + "clip_ratio/high_mean": 3.4390433256703545e-06, + "clip_ratio/low_mean": 2.8499469067355676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193851205196552e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16176.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 7463.2421875, + "completions/mean_terminated_length": 7463.2421875, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "entropy": 0.9983502700924873, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013582308311015368, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 187045035.0, + "reward": 0.3984375, + "reward_std": 0.2517249584197998, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 0.000473080639494583, + "sampling/sampling_logp_difference/max": 7.65624475479126, + "sampling/sampling_logp_difference/mean": 0.021131811663508415, + "step": 236 + }, + { + "clip_ratio/high_max": 8.509013468938065e-06, + "clip_ratio/high_mean": 2.127253367234516e-06, + "clip_ratio/low_mean": 3.985050443588989e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.197775751890731e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14938.0, + "completions/mean_length": 6460.984375, + "completions/mean_terminated_length": 
6382.8505859375, + "completions/min_length": 1747.0, + "completions/min_terminated_length": 1747.0, + "entropy": 0.7869217246770859, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002681629965081811, + "learning_rate": 1e-05, + "loss": 0.0987, + "num_tokens": 187889609.0, + "reward": 0.5234375, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999568462371826, + "sampling/importance_sampling_ratio/min": 0.0015037209959700704, + "sampling/sampling_logp_difference/max": 6.499812602996826, + "sampling/sampling_logp_difference/mean": 0.016937749460339546, + "step": 237 + }, + { + "clip_ratio/high_max": 1.2362176221358823e-05, + "clip_ratio/high_mean": 3.0905440553397057e-06, + "clip_ratio/low_mean": 5.0333514764133724e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.342405825103924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15893.0, + "completions/mean_length": 6241.78125, + "completions/mean_terminated_length": 6161.92138671875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "entropy": 1.0217387825250626, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021239183843135834, + "learning_rate": 1e-05, + "loss": 0.0353, + "num_tokens": 188706605.0, + "reward": 0.2578125, + "reward_std": 0.3135277330875397, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999796748161316, + "sampling/importance_sampling_ratio/min": 0.004853047896176577, + "sampling/sampling_logp_difference/max": 5.328148365020752, + "sampling/sampling_logp_difference/mean": 
0.02103862166404724, + "step": 238 + }, + { + "clip_ratio/high_max": 6.725130333506968e-06, + "clip_ratio/high_mean": 1.681282583376742e-06, + "clip_ratio/low_mean": 3.437372129155847e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.605500387493521e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15332.0, + "completions/mean_length": 5638.1328125, + "completions/mean_terminated_length": 5553.51953125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.7844365313649178, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023868419229984283, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 189446294.0, + "reward": 0.515625, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000369548797607, + "sampling/importance_sampling_ratio/min": 0.0008047468145377934, + "sampling/sampling_logp_difference/max": 7.124982833862305, + "sampling/sampling_logp_difference/mean": 0.017401430755853653, + "step": 239 + }, + { + "clip_ratio/high_max": 2.887730215661577e-05, + "clip_ratio/high_mean": 7.219325539153942e-06, + "clip_ratio/low_mean": 2.826443028425274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.548375502759882e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16196.0, + "completions/mean_length": 6374.8046875, + "completions/mean_terminated_length": 6215.9287109375, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9472770467400551, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027549315709620714, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 
190281461.0, + "reward": 0.3984375, + "reward_std": 0.3167053163051605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998682737350464, + "sampling/importance_sampling_ratio/min": 7.100860239006579e-05, + "sampling/sampling_logp_difference/max": 9.552709579467773, + "sampling/sampling_logp_difference/mean": 0.020243138074874878, + "step": 240 + }, + { + "clip_ratio/high_max": 1.586787766427733e-05, + "clip_ratio/high_mean": 3.9669694160693325e-06, + "clip_ratio/low_mean": 2.978218674343225e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.374915604581474e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15097.0, + "completions/mean_length": 6654.21875, + "completions/mean_terminated_length": 6499.88134765625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 1.0028243213891983, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013344973558560014, + "learning_rate": 1e-05, + "loss": 0.0184, + "num_tokens": 191156249.0, + "reward": 0.359375, + "reward_std": 0.22832971811294556, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 0.0021875568199902773, + "sampling/sampling_logp_difference/max": 6.124969959259033, + "sampling/sampling_logp_difference/mean": 0.020470600575208664, + "step": 241 + }, + { + "clip_ratio/high_max": 1.681529829511419e-05, + "clip_ratio/high_mean": 4.9954849146160996e-06, + "clip_ratio/low_mean": 2.040554932136729e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5401033553862362e-05, + "completions/clipped_ratio": 0.0234375, 
+ "completions/max_length": 16384.0, + "completions/max_terminated_length": 16172.0, + "completions/mean_length": 6767.7890625, + "completions/mean_terminated_length": 6537.00048828125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.9059296399354935, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016136945923790336, + "learning_rate": 1e-05, + "loss": 0.0816, + "num_tokens": 192040526.0, + "reward": 0.4921875, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999668598175049, + "sampling/importance_sampling_ratio/min": 1.2452921509975567e-05, + "sampling/sampling_logp_difference/max": 11.29355525970459, + "sampling/sampling_logp_difference/mean": 0.020058143883943558, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9821966563758906e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9821966563758906e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16275.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 6767.4921875, + "completions/mean_terminated_length": 6767.4921875, + "completions/min_length": 998.0, + "completions/min_terminated_length": 998.0, + "entropy": 1.0446822568774223, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002869367366656661, + "learning_rate": 1e-05, + "loss": 0.0212, + "num_tokens": 192926469.0, + "reward": 0.3828125, + "reward_std": 0.2517249882221222, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586343765259, + "sampling/importance_sampling_ratio/min": 
1.9328599591972306e-05, + "sampling/sampling_logp_difference/max": 10.853924751281738, + "sampling/sampling_logp_difference/mean": 0.021512050181627274, + "step": 243 + }, + { + "clip_ratio/high_max": 3.44581130775623e-05, + "clip_ratio/high_mean": 1.3001711295146379e-05, + "clip_ratio/low_mean": 3.6407937841431703e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.940964981869911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16261.0, + "completions/max_terminated_length": 16261.0, + "completions/mean_length": 5738.484375, + "completions/mean_terminated_length": 5738.484375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.8617956340312958, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002177527640014887, + "learning_rate": 1e-05, + "loss": -0.0189, + "num_tokens": 193678859.0, + "reward": 0.5546875, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570846557617, + "sampling/importance_sampling_ratio/min": 0.0008533780346624553, + "sampling/sampling_logp_difference/max": 7.06630802154541, + "sampling/sampling_logp_difference/mean": 0.018141131848096848, + "step": 244 + }, + { + "clip_ratio/high_max": 3.861003733618418e-06, + "clip_ratio/high_mean": 9.652509334046044e-07, + "clip_ratio/low_mean": 2.7767115511778684e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8732366558870126e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15595.0, + "completions/mean_length": 6382.90625, + "completions/mean_terminated_length": 5976.357421875, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.8692388981580734, + "epoch": 0.22539098436062557, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.004127771593630314, + "learning_rate": 1e-05, + "loss": 0.0572, + "num_tokens": 194511847.0, + "reward": 0.4140625, + "reward_std": 0.2767002582550049, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 5.4239239943854045e-06, + "sampling/sampling_logp_difference/max": 12.124691009521484, + "sampling/sampling_logp_difference/mean": 0.018376430496573448, + "step": 245 + }, + { + "clip_ratio/high_max": 9.728395525598899e-06, + "clip_ratio/high_mean": 2.4320988813997246e-06, + "clip_ratio/low_mean": 5.3631663831765763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.606376271316549e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14504.0, + "completions/max_terminated_length": 14504.0, + "completions/mean_length": 5776.15625, + "completions/mean_terminated_length": 5776.15625, + "completions/min_length": 1018.0, + "completions/min_terminated_length": 1018.0, + "entropy": 1.1195004731416702, + "epoch": 0.22631094756209752, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00263008801266551, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 195270051.0, + "reward": 0.421875, + "reward_std": 0.3618982434272766, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999971866607666, + "sampling/importance_sampling_ratio/min": 0.005209421273320913, + "sampling/sampling_logp_difference/max": 5.257286548614502, + "sampling/sampling_logp_difference/mean": 0.019923292100429535, + "step": 246 + }, + { + "clip_ratio/high_max": 1.2701100786216557e-05, + "clip_ratio/high_mean": 3.1752751965541393e-06, + "clip_ratio/low_mean": 4.2162768181697174e-05, + 
"clip_ratio/low_min": 3.873926743835909e-06, + "clip_ratio/region_mean": 4.5338043378251314e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 7411.421875, + "completions/mean_terminated_length": 7196.08056640625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.9801053553819656, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002642859937623143, + "learning_rate": 1e-05, + "loss": 0.07, + "num_tokens": 196240913.0, + "reward": 0.390625, + "reward_std": 0.27328529953956604, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999198913574219, + "sampling/importance_sampling_ratio/min": 0.00017500204558018595, + "sampling/sampling_logp_difference/max": 8.650712966918945, + "sampling/sampling_logp_difference/mean": 0.021511007100343704, + "step": 247 + }, + { + "clip_ratio/high_max": 1.5122936929401476e-05, + "clip_ratio/high_mean": 3.780734232350369e-06, + "clip_ratio/low_mean": 6.367217611114029e-05, + "clip_ratio/low_min": 4.8010447244450916e-06, + "clip_ratio/region_mean": 6.745291057086433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16127.0, + "completions/mean_length": 7944.65625, + "completions/mean_terminated_length": 7742.1123046875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 1.0132562816143036, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002439325675368309, + "learning_rate": 1e-05, + "loss": 0.0564, + "num_tokens": 197278517.0, + "reward": 0.34375, + "reward_std": 0.3161812424659729, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 
0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999248385429382, + "sampling/importance_sampling_ratio/min": 1.0140610356756952e-05, + "sampling/sampling_logp_difference/max": 11.49896240234375, + "sampling/sampling_logp_difference/mean": 0.02124868705868721, + "step": 248 + }, + { + "clip_ratio/high_max": 2.6017536356448545e-05, + "clip_ratio/high_mean": 6.504384089112136e-06, + "clip_ratio/low_mean": 3.7791321346958284e-05, + "clip_ratio/low_min": 3.2110563097376144e-06, + "clip_ratio/region_mean": 4.429570503816649e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 7550.0, + "completions/mean_terminated_length": 7409.7783203125, + "completions/min_length": 1469.0, + "completions/min_terminated_length": 1469.0, + "entropy": 1.0384011715650558, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014879995724186301, + "learning_rate": 1e-05, + "loss": 0.0338, + "num_tokens": 198265589.0, + "reward": 0.3359375, + "reward_std": 0.24040167033672333, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999468922615051, + "sampling/importance_sampling_ratio/min": 8.418659126618877e-05, + "sampling/sampling_logp_difference/max": 9.382474899291992, + "sampling/sampling_logp_difference/mean": 0.021503347903490067, + "step": 249 + }, + { + "clip_ratio/high_max": 1.3615457191917812e-05, + "clip_ratio/high_mean": 4.491880531531933e-06, + "clip_ratio/low_mean": 3.916533574965797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.365721684962409e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16221.0, + "completions/mean_length": 8140.9140625, + 
"completions/mean_terminated_length": 7517.48779296875, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.8718572407960892, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002340668346732855, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 199324938.0, + "reward": 0.453125, + "reward_std": 0.35824596881866455, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999454021453857, + "sampling/importance_sampling_ratio/min": 0.002325017238035798, + "sampling/sampling_logp_difference/max": 6.064027786254883, + "sampling/sampling_logp_difference/mean": 0.019466478377580643, + "step": 250 + }, + { + "clip_ratio/high_max": 2.2175697040438536e-05, + "clip_ratio/high_mean": 5.543924260109634e-06, + "clip_ratio/low_mean": 4.1318608055007644e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.686253225827386e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16263.0, + "completions/mean_length": 6630.96875, + "completions/mean_terminated_length": 6396.896484375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.7798146530985832, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001989356242120266, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 200189902.0, + "reward": 0.5625, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474883079529, + "sampling/importance_sampling_ratio/min": 0.0003315774374641478, + "sampling/sampling_logp_difference/max": 8.011649131774902, + 
"sampling/sampling_logp_difference/mean": 0.01849902793765068, + "step": 251 + }, + { + "clip_ratio/high_max": 3.325706302348408e-06, + "clip_ratio/high_mean": 8.31426575587102e-07, + "clip_ratio/low_mean": 2.0285911205064622e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.111733795118198e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15357.0, + "completions/max_terminated_length": 15357.0, + "completions/mean_length": 6582.203125, + "completions/mean_terminated_length": 6582.203125, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 1.0181676000356674, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002594445599243045, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 201052832.0, + "reward": 0.34375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999495148658752, + "sampling/importance_sampling_ratio/min": 0.0003853558446280658, + "sampling/sampling_logp_difference/max": 7.8613433837890625, + "sampling/sampling_logp_difference/mean": 0.021598614752292633, + "step": 252 + }, + { + "clip_ratio/high_max": 2.2044430352252675e-05, + "clip_ratio/high_mean": 5.511107588063169e-06, + "clip_ratio/low_mean": 3.4155824209847196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96669319115972e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14540.0, + "completions/max_terminated_length": 14540.0, + "completions/mean_length": 6145.1796875, + "completions/mean_terminated_length": 6145.1796875, + "completions/min_length": 1098.0, + "completions/min_terminated_length": 1098.0, + "entropy": 0.9084350541234016, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003104996867477894, + "learning_rate": 1e-05, + 
"loss": 0.0811, + "num_tokens": 201858047.0, + "reward": 0.5078125, + "reward_std": 0.33220985531806946, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 0.007650630082935095, + "sampling/sampling_logp_difference/max": 4.87296724319458, + "sampling/sampling_logp_difference/mean": 0.018979094922542572, + "step": 253 + }, + { + "clip_ratio/high_max": 2.9959978519400465e-05, + "clip_ratio/high_mean": 7.489994629850116e-06, + "clip_ratio/low_mean": 3.5255963325653283e-05, + "clip_ratio/low_min": 2.973075879708631e-06, + "clip_ratio/region_mean": 4.274595892184152e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15745.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 7259.953125, + "completions/mean_terminated_length": 7259.953125, + "completions/min_length": 960.0, + "completions/min_terminated_length": 960.0, + "entropy": 0.9823614731431007, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003212577663362026, + "learning_rate": 1e-05, + "loss": 0.0133, + "num_tokens": 202807673.0, + "reward": 0.4765625, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999860405921936, + "sampling/importance_sampling_ratio/min": 0.000536504783667624, + "sampling/sampling_logp_difference/max": 7.530435085296631, + "sampling/sampling_logp_difference/mean": 0.021432969719171524, + "step": 254 + }, + { + "clip_ratio/high_max": 3.273996276220714e-05, + "clip_ratio/high_mean": 9.095591565255745e-06, + "clip_ratio/low_mean": 2.9539680099333054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8635271948805894e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16369.0, + "completions/mean_length": 7258.71875, + "completions/mean_terminated_length": 7113.87353515625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8823810070753098, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001418307889252901, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 203757333.0, + "reward": 0.40625, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884963035583, + "sampling/importance_sampling_ratio/min": 0.0006408974295482039, + "sampling/sampling_logp_difference/max": 7.3526411056518555, + "sampling/sampling_logp_difference/mean": 0.019296500831842422, + "step": 255 + }, + { + "clip_ratio/high_max": 1.544119368190877e-05, + "clip_ratio/high_mean": 3.860298420477193e-06, + "clip_ratio/low_mean": 3.755458698151415e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.141488631148604e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7011.40625, + "completions/mean_terminated_length": 6386.56689453125, + "completions/min_length": 685.0, + "completions/min_terminated_length": 685.0, + "entropy": 0.8057166337966919, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001652427832596004, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 204675065.0, + "reward": 0.46875, + "reward_std": 0.24146251380443573, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918937683105, + 
"sampling/importance_sampling_ratio/min": 0.015319154597818851, + "sampling/sampling_logp_difference/max": 4.178651332855225, + "sampling/sampling_logp_difference/mean": 0.018787402659654617, + "step": 256 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 204675065, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-256/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-256/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-256/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
from deepspeed.utils import logger
from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)


@dataclass
class zero_model_state:
    """Contents of one ``*_model_states.pt`` file, as collected by ``parse_model_states``."""
    # buffer name -> buffer tensor, converted with ``.float()``
    buffers: dict
    # value stored under PARAM_SHAPES; consumed as a list of {param name: shape} dicts (one per group)
    param_shapes: dict
    # list of [name, source_name] pairs built from the checkpoint's "shared_params" mapping
    shared_params: list
    # value stored under DS_VERSION, or None when the key is absent
    ds_version: int
    # value stored under FROZEN_PARAM_SHAPES, or None when the key is absent
    frozen_param_shapes: dict
    # value stored under FROZEN_PARAM_FRAGMENTS, or None when the key is absent
    frozen_param_fragments: dict


# set to a truthy value to print verbose reconstruction details
debug = 0

# load to cpu
device = torch.device('cpu')


def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of digits, otherwise ``text`` unchanged."""
    return int(text) if text.isdigit() else text


def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    return [atoi(c) for c in re.split(r'(\d+)', text)]


def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the model-states file inside ``checkpoint_dir`` for the given ZeRO stage.

    Raises:
        FileNotFoundError: ``checkpoint_dir`` is not a directory, or the expected file is missing.
        ValueError: ``zero_stage`` is greater than 3, so no file name is defined for it.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # previously fell through and failed below with an UnboundLocalError on `file`
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file


def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """
    Return the files in ``checkpoint_dir`` matching ``glob_pattern``, sorted in natural (human) order.

    Raises:
        FileNotFoundError: no file matches the pattern.
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)

    if len(ckpt_files) == 0:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files


def get_optim_files(checkpoint_dir):
    """Return all ``*_optim_states.pt`` files in ``checkpoint_dir`` (natural order)."""
    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")


def get_model_state_files(checkpoint_dir):
    """Return all ``*_model_states.pt`` files in ``checkpoint_dir`` (natural order)."""
    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")


def parse_model_states(files):
    """
    Load every model-states file and return one ``zero_model_state`` per file, in the same order.

    Raises:
        ValueError: a file does not contain the BUFFER_NAMES key.
    """
    zero_model_states = []
    for file in files:
        # NOTE(review): weights_only=False unpickles arbitrary objects; only use on trusted checkpoints
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional in the checkpoint
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if frozen_param_shapes is not None:
            if debug:
                print(f"Found frozen_param_shapes: {frozen_param_shapes}")

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        z_model_state = zero_model_state(buffers=buffers,
                                         param_shapes=param_shapes,
                                         shared_params=shared_params,
                                         ds_version=ds_version,
                                         frozen_param_shapes=frozen_param_shapes,
                                         frozen_param_fragments=frozen_param_fragments)
        zero_model_states.append(z_model_state)

    return zero_model_states


def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load every optimizer-states file and extract the fp32 flat groups.

    Returns:
        ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups[i]`` is the value stored
        by the i-th file under the stage-specific fp32 groups key.

    Raises:
        ValueError: the first file has no ZERO_STAGE entry, the number of files does not match the
            recorded partition count, or the zero stage is greater than 3.
    """
    total_files = len(files)
    state_dicts = []
    for f in tqdm(files, desc='Loading checkpoint shards'):
        # NOTE(review): weights_only=False unpickles arbitrary objects; only use on trusted checkpoints
        state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
        # and also handle the case where it was already removed by another helper script
        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(state_dict)

    if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.

    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
    return zero_stage, world_size, fp32_flat_groups


def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: when true, frozen parameters are not merged into the result

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    optim_files = get_optim_files(ds_checkpoint_dir)
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    model_files = get_model_state_files(ds_checkpoint_dir)

    zero_model_states = parse_model_states(model_files)
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    # parse_optim_states has already rejected any stage other than <=2 or 3
    if zero_stage <= 2:
        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)
    elif zero_stage == 3:
        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                                          exclude_frozen_parameters)


def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy the frozen parameters recorded by the first model state into ``state_dict`` (ZeRO stage <= 2)."""
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    frozen_param_fragments = zero_model_states[0].frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        # the fragment is stored as-is; no concatenation or reshaping is applied here
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


def _has_callable(obj, fn):
    """Return True when ``obj`` has an attribute named ``fn`` that is callable."""
    attr = getattr(obj, fn, None)
    return callable(attr)


def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Rebuild the trainable parameters of a ZeRO stage <= 2 checkpoint into ``state_dict``.

    For each param group the per-rank partitions are concatenated into one flat vector, which is then
    sliced into the individual parameters using the shapes recorded by the first model state.

    Raises:
        ValueError: after alignment, the consumed element count differs from the available count.
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for i in range(world_size):
            for j in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = []
    for i in range(num_param_groups):
        merged_partitions = [sd[i] for sd in fp32_flat_groups]
        full_single_fp32_vector = torch.cat(merged_partitions, 0)
        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
    avail_numel = sum(
        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = full_single_fp32_vector.numel()
        for name, shape in shapes.items():

            # shapes may be objects exposing numel() or plain sequences of ints
            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
        # live optimizer object, so we are checking that the numbers are within the right range
        align_to = 2 * world_size

        def zero2_align(x):
            return align_to * math.ceil(x / align_to)

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """
    Build the full state dict for a ZeRO stage <= 2 checkpoint.

    The result contains, in insertion order: buffers, frozen parameters (unless excluded), trainable
    parameters, and finally aliases for shared parameters whose source is present.
    """
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters
    for pair in zero_model_states[0].shared_params:
        if pair[1] in state_dict:
            state_dict[pair[0]] = state_dict[pair[1]]

    return state_dict


def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Return ``(partitioned_numel, padding_numel)`` for a parameter split evenly across ``world_size`` ranks.

    ``partitioned_numel`` is ``ceil(unpartitioned_numel / world_size)``; ``padding_numel`` is the number of
    elements needed to round ``unpartitioned_numel`` up to a multiple of ``world_size``.
    """
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel


def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """
    Rebuild the frozen parameters of a ZeRO stage 3 checkpoint into ``state_dict``.

    Each parameter is the concatenation of its fragments from every model state, truncated to the
    unpartitioned element count (dropping padding) and viewed with the recorded shape.
    """
    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
        return

    if debug:
        for i in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
    wanted_params = len(frozen_param_shapes)
    wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
    avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
    print(f'Frozen params: Have {avail_numel} numels to process.')
    print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in zero_model_states[0].frozen_param_shapes.items():
        total_params += 1
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel

        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        """
        Args:
            flat_groups: ``flat_groups[rank][group]`` is the flat tensor of one group on one rank.
            flat_groups_offset: cumulative element offsets of the groups, starting with 0
                (one more entry than there are groups).
            offset: start of this parameter's partition within a rank's concatenated groups.
            partitioned_numel: number of elements of this parameter held by each rank.
            shape: full (unpartitioned) shape; must expose ``numel()``.
        """
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Raises:
            ValueError: the partition ``[offset, offset + partitioned_numel)`` does not fall inside the
                range described by ``flat_groups_offset``.
        """
        end_idx = self.offset + self.partitioned_numel
        group_bounds = self.flat_groups_offset

        # The groups touched by this partition are the same on every rank, so locate them once.
        start_group_id = None
        end_group_id = None
        for group_id in range(len(group_bounds) - 1):
            if group_bounds[group_id] <= self.offset < group_bounds[group_id + 1]:
                start_group_id = group_id
            if group_bounds[group_id] < end_idx <= group_bounds[group_id + 1]:
                end_group_id = group_id
                break
        if start_group_id is None or end_group_id is None:
            raise ValueError(f"partition [{self.offset}, {end_idx}) is outside the flat groups "
                             f"(total numel per rank: {group_bounds[-1]})")

        pad_flat_param_chunks = []
        for flat_groups_at_rank_i in self.flat_groups:
            # for each rank, we need to collect weights from related group/groups
            for group_id in range(start_group_id, end_group_id + 1):
                group_start = group_bounds[group_id]
                flat_tensor = flat_groups_at_rank_i[group_id]
                # clamp both ends to the group so that a partition spanning several groups starts at
                # index 0 of every group after the first (an unclamped start would be negative there)
                start_offset = max(self.offset, group_start) - group_start
                end_offset = min(end_idx, group_bounds[group_id + 1]) - group_start
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        # drop trailing padding, then restore the full shape
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param


def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """
    Register the trainable parameters of a ZeRO stage 3 checkpoint in ``state_dict``.

    Every parameter is stored as a ``GatheredTensor`` that records where its per-rank partition lives;
    the actual concatenation happens when ``contiguous()`` is called on it.

    Raises:
        ValueError: the total number of consumed elements differs from the available count.
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        # fp32_flat_groups[i] is a list of per-group tensors, so shapes are reported per group
        for i in range(world_size):
            for j, flat_group in enumerate(fp32_flat_groups[i]):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # memory efficient tensor
        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
        state_dict[name] = tensor
        offset += partitioned_numel

    # offset counted per-rank elements; scale to compare against the all-ranks total
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")


def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    state_dict = OrderedDict()

    # buffers
    buffers = zero_model_states[0].buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
 + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/README.md b/dapo_lora_plus_20251202_001141/checkpoint-320/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-320/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users 
(both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-320/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-320/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", 
+ "k_proj", + "gate_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-320/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-320/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false 
-%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/latest b/dapo_lora_plus_20251202_001141/checkpoint-320/latest new file mode 100644 index 0000000000000000000000000000000000000000..9d535587efdab3121736d8095481e4143f000213 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-320/latest @@ -0,0 +1 @@ +global_step320 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-320/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-320/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/tokenizer_config.json 
b/dapo_lora_plus_20251202_001141/checkpoint-320/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-320/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": 
"LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-320/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8eaf34db285507204c1e5ddd562e56437a34ba41 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-320/trainer_state.json @@ -0,0 +1,9954 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.29438822447102114, + "eval_steps": 500, + "global_step": 320, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025745572056621313, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 5.499582130141789e-06, + "clip_ratio/high_mean": 1.3748955325354473e-06, + "clip_ratio/low_mean": 
2.871888784738985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009378326623846e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 4767.1875, + "completions/mean_terminated_length": 4767.1875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.088237851858139, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002068034838885069, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 1425798.0, + "reward": 0.3046875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 0.01811397261917591, + "sampling/sampling_logp_difference/max": 4.011071681976318, + "sampling/sampling_logp_difference/mean": 0.01877593621611595, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.459846724103045e-05, + "clip_ratio/low_min": 3.4060874440910993e-06, + "clip_ratio/region_mean": 4.459846724103045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 6586.359375, + "completions/mean_terminated_length": 6351.21630859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0497623533010483, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001971944235265255, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 2287420.0, + "reward": 0.28125, + "reward_std": 0.29143062233924866, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999316334724426, + "sampling/importance_sampling_ratio/min": 5.356698966352269e-05, + "sampling/sampling_logp_difference/max": 9.834577560424805, + "sampling/sampling_logp_difference/mean": 0.02137824520468712, + "step": 3 + }, + { + "clip_ratio/high_max": 1.7640652004047297e-05, + "clip_ratio/high_mean": 5.48578327652649e-06, + "clip_ratio/low_mean": 3.218628648937738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.767206976590387e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14690.0, + "completions/max_terminated_length": 14690.0, + "completions/mean_length": 5448.0234375, + "completions/mean_terminated_length": 5448.0234375, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 1.1134418621659279, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016465173102915287, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 3009167.0, + "reward": 0.2890625, + "reward_std": 0.27958330512046814, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 7.889385415182915e-06, + "sampling/sampling_logp_difference/max": 11.749992370605469, + "sampling/sampling_logp_difference/mean": 0.020580951124429703, + "step": 4 + }, + { + "clip_ratio/high_max": 1.3439519989333348e-05, + "clip_ratio/high_mean": 3.359879997333337e-06, + "clip_ratio/low_mean": 2.8849915906903334e-05, + "clip_ratio/low_min": 8.467687621305231e-06, + "clip_ratio/region_mean": 3.220979442630778e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13420.0, + "completions/mean_length": 5436.8671875, + 
"completions/mean_terminated_length": 5350.66943359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 1.1473859176039696, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023770295083522797, + "learning_rate": 1e-05, + "loss": 0.0153, + "num_tokens": 3725654.0, + "reward": 0.2734375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991774559021, + "sampling/importance_sampling_ratio/min": 0.0011146117467433214, + "sampling/sampling_logp_difference/max": 6.799249172210693, + "sampling/sampling_logp_difference/mean": 0.020377254113554955, + "step": 5 + }, + { + "clip_ratio/high_max": 4.652201369026443e-06, + "clip_ratio/high_mean": 1.1630503422566107e-06, + "clip_ratio/low_mean": 2.8399212624208303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9562263534899103e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14440.0, + "completions/max_terminated_length": 14440.0, + "completions/mean_length": 4697.5390625, + "completions/mean_terminated_length": 4697.5390625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.0097229778766632, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003342699259519577, + "learning_rate": 1e-05, + "loss": 0.0326, + "num_tokens": 4345547.0, + "reward": 0.390625, + "reward_std": 0.34480881690979004, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914765357971, + "sampling/importance_sampling_ratio/min": 0.002385853324085474, + "sampling/sampling_logp_difference/max": 6.038198471069336, + 
"sampling/sampling_logp_difference/mean": 0.0185473021119833, + "step": 6 + }, + { + "clip_ratio/high_max": 9.362594937556423e-06, + "clip_ratio/high_mean": 2.340648734389106e-06, + "clip_ratio/low_mean": 6.054362825125281e-05, + "clip_ratio/low_min": 7.427356649714056e-06, + "clip_ratio/region_mean": 6.288427744038927e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14652.0, + "completions/mean_length": 6218.2109375, + "completions/mean_terminated_length": 5890.2822265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.0579778030514717, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002073560608550906, + "learning_rate": 1e-05, + "loss": 0.0201, + "num_tokens": 5160646.0, + "reward": 0.2109375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 0.00044544730917550623, + "sampling/sampling_logp_difference/max": 7.716431617736816, + "sampling/sampling_logp_difference/mean": 0.020321575924754143, + "step": 7 + }, + { + "clip_ratio/high_max": 1.1064067621191498e-05, + "clip_ratio/high_mean": 2.7660169052978745e-06, + "clip_ratio/low_mean": 2.2175867059104348e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4941883737028547e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13637.0, + "completions/mean_length": 5127.8359375, + "completions/mean_terminated_length": 5039.20458984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.0472618415951729, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.0032994600478559732, + "learning_rate": 1e-05, + "loss": 0.0751, + "num_tokens": 5836289.0, + "reward": 0.3359375, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483227729797, + "sampling/importance_sampling_ratio/min": 0.0013780994340777397, + "sampling/sampling_logp_difference/max": 6.587049961090088, + "sampling/sampling_logp_difference/mean": 0.01940803974866867, + "step": 8 + }, + { + "clip_ratio/high_max": 1.2357884770608507e-05, + "clip_ratio/high_mean": 3.0894711926521268e-06, + "clip_ratio/low_mean": 3.000627111759968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.309574231025181e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15916.0, + "completions/mean_length": 4516.890625, + "completions/mean_terminated_length": 4423.44873046875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.911251038312912, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003016560571268201, + "learning_rate": 1e-05, + "loss": 0.1006, + "num_tokens": 6433171.0, + "reward": 0.390625, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999179840087891, + "sampling/importance_sampling_ratio/min": 0.005480794236063957, + "sampling/sampling_logp_difference/max": 5.206505298614502, + "sampling/sampling_logp_difference/mean": 0.017437148839235306, + "step": 9 + }, + { + "clip_ratio/high_max": 4.6329013457580004e-05, + "clip_ratio/high_mean": 1.1582253364395001e-05, + "clip_ratio/low_mean": 7.069455705277505e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 8.227681109929108e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13970.0, + "completions/mean_length": 4961.453125, + "completions/mean_terminated_length": 4687.31201171875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "entropy": 0.6808596402406693, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0035386616364121437, + "learning_rate": 1e-05, + "loss": 0.0596, + "num_tokens": 7085389.0, + "reward": 0.5625, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0002734088629949838, + "sampling/sampling_logp_difference/max": 8.20454216003418, + "sampling/sampling_logp_difference/mean": 0.01566406339406967, + "step": 10 + }, + { + "clip_ratio/high_max": 2.43190661421977e-05, + "clip_ratio/high_mean": 6.079766535549425e-06, + "clip_ratio/low_mean": 2.2395396172214532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8475162707763957e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14776.0, + "completions/mean_length": 4429.40625, + "completions/mean_terminated_length": 4335.275390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9181502386927605, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022535293828696012, + "learning_rate": 1e-05, + "loss": 0.0031, + "num_tokens": 7672185.0, + "reward": 0.3671875, + "reward_std": 0.20357418060302734, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998801946640015, + "sampling/importance_sampling_ratio/min": 5.315856554943821e-08, + "sampling/sampling_logp_difference/max": 16.74998664855957, + "sampling/sampling_logp_difference/mean": 0.018429335206747055, + "step": 11 + }, + { + "clip_ratio/high_max": 1.0117325928149512e-05, + "clip_ratio/high_mean": 2.529331482037378e-06, + "clip_ratio/low_mean": 1.1982813475697185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.45121450714214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5282.6796875, + "completions/mean_terminated_length": 5106.46875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "entropy": 1.113751620054245, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013591813622042537, + "learning_rate": 1e-05, + "loss": 0.0971, + "num_tokens": 8369000.0, + "reward": 0.3984375, + "reward_std": 0.3029736578464508, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 3.970265970565379e-05, + "sampling/sampling_logp_difference/max": 10.134092330932617, + "sampling/sampling_logp_difference/mean": 0.020221836864948273, + "step": 12 + }, + { + "clip_ratio/high_max": 5.411958227341529e-06, + "clip_ratio/high_mean": 1.3529895568353822e-06, + "clip_ratio/low_mean": 2.5284593846208736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6637583516730956e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15925.0, + "completions/mean_length": 6970.421875, + "completions/mean_terminated_length": 6744.49609375, + "completions/min_length": 53.0, + 
"completions/min_terminated_length": 53.0, + "entropy": 1.1721933633089066, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024079051800072193, + "learning_rate": 1e-05, + "loss": 0.0713, + "num_tokens": 9283182.0, + "reward": 0.171875, + "reward_std": 0.17965975403785706, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999163746833801, + "sampling/importance_sampling_ratio/min": 0.0008915197686292231, + "sampling/sampling_logp_difference/max": 7.0225830078125, + "sampling/sampling_logp_difference/mean": 0.021462474018335342, + "step": 13 + }, + { + "clip_ratio/high_max": 2.0661535927501973e-05, + "clip_ratio/high_mean": 5.165383981875493e-06, + "clip_ratio/low_mean": 2.4304956298237812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.947033948430544e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14658.0, + "completions/max_terminated_length": 14658.0, + "completions/mean_length": 4886.875, + "completions/mean_terminated_length": 4886.875, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 1.0108910650014877, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002063734456896782, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 9928446.0, + "reward": 0.3515625, + "reward_std": 0.2409384697675705, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 0.0003672837920021266, + "sampling/sampling_logp_difference/max": 7.9093756675720215, + "sampling/sampling_logp_difference/mean": 0.01918785460293293, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.4761846993424115e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4761846993424115e-06, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12992.0, + "completions/max_terminated_length": 12992.0, + "completions/mean_length": 4824.0078125, + "completions/mean_terminated_length": 4824.0078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 1.1070282831788063, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002424790756776929, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 10566415.0, + "reward": 0.28125, + "reward_std": 0.23698672652244568, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0011708867968991399, + "sampling/sampling_logp_difference/max": 6.749993801116943, + "sampling/sampling_logp_difference/mean": 0.02069389820098877, + "step": 15 + }, + { + "clip_ratio/high_max": 3.5075904634140898e-06, + "clip_ratio/high_mean": 8.768976158535224e-07, + "clip_ratio/low_mean": 2.2676964135825983e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3553861751679506e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12685.0, + "completions/mean_length": 5449.4140625, + "completions/mean_terminated_length": 5363.31494140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9817888736724854, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021046048495918512, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 11281908.0, + "reward": 0.2265625, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.2265625, + 
"rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.013273254036903381, + "sampling/sampling_logp_difference/max": 4.322004318237305, + "sampling/sampling_logp_difference/mean": 0.019556276500225067, + "step": 16 + }, + { + "clip_ratio/high_max": 1.624216065465589e-05, + "clip_ratio/high_mean": 4.060540163663973e-06, + "clip_ratio/low_mean": 5.4349347919924185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.840988796990132e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14133.0, + "completions/max_terminated_length": 14133.0, + "completions/mean_length": 5343.25, + "completions/mean_terminated_length": 5343.25, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 1.04741720110178, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035894038155674934, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 11987692.0, + "reward": 0.3359375, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998996257781982, + "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05, + "sampling/sampling_logp_difference/max": 10.749964714050293, + "sampling/sampling_logp_difference/mean": 0.020530637353658676, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.272115029380075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.272115029380075e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15138.0, + "completions/mean_length": 6301.9375, + "completions/mean_terminated_length": 
5806.09814453125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.8892941772937775, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032246762420982122, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 12814244.0, + "reward": 0.3125, + "reward_std": 0.3606000542640686, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999184608459473, + "sampling/importance_sampling_ratio/min": 0.021351110190153122, + "sampling/sampling_logp_difference/max": 3.846651554107666, + "sampling/sampling_logp_difference/mean": 0.017541853711009026, + "step": 18 + }, + { + "clip_ratio/high_max": 9.956602298188955e-06, + "clip_ratio/high_mean": 2.4891505745472386e-06, + "clip_ratio/low_mean": 2.772165316855535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0210803743102588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16213.0, + "completions/max_terminated_length": 16213.0, + "completions/mean_length": 5297.46875, + "completions/mean_terminated_length": 5297.46875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8097029253840446, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023969109170138836, + "learning_rate": 1e-05, + "loss": -0.0153, + "num_tokens": 13512520.0, + "reward": 0.359375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999222159385681, + "sampling/importance_sampling_ratio/min": 0.005766105372458696, + "sampling/sampling_logp_difference/max": 5.155758380889893, + "sampling/sampling_logp_difference/mean": 0.017464376986026764, + "step": 19 + }, + { + 
"clip_ratio/high_max": 1.0098337497765897e-05, + "clip_ratio/high_mean": 2.524584374441474e-06, + "clip_ratio/low_mean": 3.173396362399217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.425854845318099e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14655.0, + "completions/mean_length": 4890.34375, + "completions/mean_terminated_length": 4799.84228515625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.9267145916819572, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002759338356554508, + "learning_rate": 1e-05, + "loss": -0.0014, + "num_tokens": 14155556.0, + "reward": 0.3515625, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.008491010405123234, + "sampling/sampling_logp_difference/max": 4.768747329711914, + "sampling/sampling_logp_difference/mean": 0.018839433789253235, + "step": 20 + }, + { + "clip_ratio/high_max": 7.532389190600952e-06, + "clip_ratio/high_mean": 1.883097297650238e-06, + "clip_ratio/low_mean": 1.9051809317716106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0934906729053182e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16296.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 4609.40625, + "completions/mean_terminated_length": 4609.40625, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 1.171089917421341, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021055075339972973, + "learning_rate": 1e-05, + "loss": -0.0051, + "num_tokens": 14765328.0, + "reward": 0.2421875, + "reward_std": 
0.2409384548664093, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741911888123, + "sampling/importance_sampling_ratio/min": 5.368983693188056e-07, + "sampling/sampling_logp_difference/max": 14.437457084655762, + "sampling/sampling_logp_difference/mean": 0.020226795226335526, + "step": 21 + }, + { + "clip_ratio/high_max": 1.7169573766295798e-05, + "clip_ratio/high_mean": 4.2923934415739495e-06, + "clip_ratio/low_mean": 5.869748633813288e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.0162142189074075e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14299.0, + "completions/mean_length": 5099.0390625, + "completions/mean_terminated_length": 5010.18115234375, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.005959376692772, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027595218271017075, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 15438549.0, + "reward": 0.296875, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887347221375, + "sampling/importance_sampling_ratio/min": 0.00013984869292471558, + "sampling/sampling_logp_difference/max": 8.87494945526123, + "sampling/sampling_logp_difference/mean": 0.01902824640274048, + "step": 22 + }, + { + "clip_ratio/high_max": 5.162942670722259e-06, + "clip_ratio/high_mean": 1.2907356676805648e-06, + "clip_ratio/low_mean": 3.6872071063953626e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.816280593582633e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16204.0, + "completions/mean_length": 7138.0390625, + "completions/mean_terminated_length": 6839.7822265625, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.0403362140059471, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002748022088780999, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 16373898.0, + "reward": 0.296875, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999048709869385, + "sampling/importance_sampling_ratio/min": 0.0003802926803473383, + "sampling/sampling_logp_difference/max": 7.874569416046143, + "sampling/sampling_logp_difference/mean": 0.020853528752923012, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.6506045439164154e-05, + "clip_ratio/low_min": 5.709326615033206e-06, + "clip_ratio/region_mean": 5.6506045439164154e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14543.0, + "completions/mean_length": 5420.515625, + "completions/mean_terminated_length": 5334.18896484375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 1.1339883506298065, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029502976685762405, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 17088156.0, + "reward": 0.1953125, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 9.70982582657598e-05, + 
"sampling/sampling_logp_difference/max": 9.239787101745605, + "sampling/sampling_logp_difference/mean": 0.0199423898011446, + "step": 24 + }, + { + "clip_ratio/high_max": 5.619998319161823e-06, + "clip_ratio/high_mean": 1.4049995797904558e-06, + "clip_ratio/low_mean": 6.439320418394345e-05, + "clip_ratio/low_min": 4.70632539872895e-06, + "clip_ratio/region_mean": 6.57982034226734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14636.0, + "completions/mean_length": 5116.3046875, + "completions/mean_terminated_length": 4845.88037109375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.9503882825374603, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004891107324510813, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 17766619.0, + "reward": 0.3203125, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0010618992382660508, + "sampling/sampling_logp_difference/max": 6.847696304321289, + "sampling/sampling_logp_difference/mean": 0.01914183795452118, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.839018643247982e-05, + "clip_ratio/low_min": 4.115091087442124e-06, + "clip_ratio/region_mean": 3.839018643247982e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14634.0, + "completions/mean_length": 5061.8671875, + "completions/mean_terminated_length": 4972.71630859375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.0540335327386856, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0030373274348676205, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 18432938.0, + "reward": 0.34375, + "reward_std": 0.28118088841438293, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999624490737915, + "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06, + "sampling/sampling_logp_difference/max": 13.272432327270508, + "sampling/sampling_logp_difference/mean": 0.019548218697309494, + "step": 26 + }, + { + "clip_ratio/high_max": 1.4656657867817557e-05, + "clip_ratio/high_mean": 4.665093399580655e-06, + "clip_ratio/low_mean": 3.751162262233265e-05, + "clip_ratio/low_min": 4.413062470121076e-06, + "clip_ratio/region_mean": 4.2176716192443564e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15782.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6349.9765625, + "completions/mean_terminated_length": 6349.9765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0268081277608871, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017623496241867542, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 19264743.0, + "reward": 0.2734375, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 6.870362267363816e-05, + "sampling/sampling_logp_difference/max": 9.585708618164062, + "sampling/sampling_logp_difference/mean": 0.019106190651655197, + "step": 27 + }, + { + "clip_ratio/high_max": 9.221375876222737e-06, + "clip_ratio/high_mean": 2.3053439690556843e-06, + "clip_ratio/low_mean": 3.09787185415189e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.328406273794826e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 5815.484375, + "completions/mean_terminated_length": 5561.84033203125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 1.0389493256807327, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003111837198957801, + "learning_rate": 1e-05, + "loss": -0.0162, + "num_tokens": 20030109.0, + "reward": 0.34375, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000298023223877, + "sampling/importance_sampling_ratio/min": 0.02987043187022209, + "sampling/sampling_logp_difference/max": 3.5108861923217773, + "sampling/sampling_logp_difference/mean": 0.020060991868376732, + "step": 28 + }, + { + "clip_ratio/high_max": 6.7810142354574054e-06, + "clip_ratio/high_mean": 1.6952535588643514e-06, + "clip_ratio/low_mean": 4.474762545214617e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644287901101052e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 5157.1484375, + "completions/mean_terminated_length": 5068.748046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.0510126948356628, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003041633637621999, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 20710904.0, + "reward": 0.3125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.04357198625802994, + "sampling/sampling_logp_difference/max": 3.133340835571289, + "sampling/sampling_logp_difference/mean": 0.019007597118616104, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0962848566341563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0962848566341563e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15333.0, + "completions/max_terminated_length": 15333.0, + "completions/mean_length": 4446.3828125, + "completions/mean_terminated_length": 4446.3828125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.053279548883438, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022369560319930315, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 21298497.0, + "reward": 0.390625, + "reward_std": 0.24169495701789856, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998750686645508, + "sampling/importance_sampling_ratio/min": 0.006704842206090689, + "sampling/sampling_logp_difference/max": 5.00492525100708, + "sampling/sampling_logp_difference/mean": 0.01947362720966339, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8460265411922592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8460265411922592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15386.0, + "completions/mean_length": 6294.1484375, + "completions/mean_terminated_length": 6133.9921875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, 
+ "entropy": 1.2036212533712387, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021383841522037983, + "learning_rate": 1e-05, + "loss": 0.033, + "num_tokens": 22124812.0, + "reward": 0.171875, + "reward_std": 0.20752590894699097, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858736991882, + "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07, + "sampling/sampling_logp_difference/max": 14.742476463317871, + "sampling/sampling_logp_difference/mean": 0.022367021068930626, + "step": 31 + }, + { + "clip_ratio/high_max": 1.73864664247958e-05, + "clip_ratio/high_mean": 4.34661660619895e-06, + "clip_ratio/low_mean": 3.19569651310303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630358173722925e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14893.0, + "completions/mean_length": 6011.4921875, + "completions/mean_terminated_length": 5929.81884765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.123318687081337, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00126531848218292, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 22915091.0, + "reward": 0.171875, + "reward_std": 0.2330477386713028, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05, + "sampling/sampling_logp_difference/max": 11.02016544342041, + "sampling/sampling_logp_difference/mean": 0.019905246794223785, + "step": 32 + }, + { + "clip_ratio/high_max": 2.8753217975463485e-05, + "clip_ratio/high_mean": 
7.188304493865871e-06, + "clip_ratio/low_mean": 3.818478444372886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.537308905128157e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5152.46875, + "completions/mean_terminated_length": 5064.03125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 1.0477670058608055, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030069497879594564, + "learning_rate": 1e-05, + "loss": 0.1026, + "num_tokens": 23596487.0, + "reward": 0.3359375, + "reward_std": 0.29142576456069946, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 9.009604013954231e-07, + "sampling/sampling_logp_difference/max": 13.919804573059082, + "sampling/sampling_logp_difference/mean": 0.019003981724381447, + "step": 33 + }, + { + "clip_ratio/high_max": 3.069575450354023e-05, + "clip_ratio/high_mean": 7.673938625885057e-06, + "clip_ratio/low_mean": 3.4847614415411954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.252155258654966e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12792.0, + "completions/max_terminated_length": 12792.0, + "completions/mean_length": 4672.5703125, + "completions/mean_terminated_length": 4672.5703125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9471446052193642, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002676331205293536, + "learning_rate": 1e-05, + "loss": 0.0724, + "num_tokens": 24213408.0, + "reward": 0.3203125, + "reward_std": 0.2988021969795227, + "rewards/accuracy_reward/mean": 0.3203125, + 
"rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000251531600952, + "sampling/importance_sampling_ratio/min": 0.0013351094676181674, + "sampling/sampling_logp_difference/max": 6.618741989135742, + "sampling/sampling_logp_difference/mean": 0.0179576613008976, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6127243245355203e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6127243245355203e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16108.0, + "completions/mean_length": 7013.734375, + "completions/mean_terminated_length": 6711.4677734375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 1.1254516392946243, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023615453392267227, + "learning_rate": 1e-05, + "loss": 0.0384, + "num_tokens": 25130262.0, + "reward": 0.1953125, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06, + "sampling/sampling_logp_difference/max": 11.925450325012207, + "sampling/sampling_logp_difference/mean": 0.0215257927775383, + "step": 35 + }, + { + "clip_ratio/high_max": 4.06954040954588e-06, + "clip_ratio/high_mean": 1.01738510238647e-06, + "clip_ratio/low_mean": 4.180071573500754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.281810015527299e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5858.59375, + 
"completions/mean_terminated_length": 5605.984375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 1.0713739022612572, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029018481727689505, + "learning_rate": 1e-05, + "loss": 0.1041, + "num_tokens": 25898194.0, + "reward": 0.3671875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05, + "sampling/sampling_logp_difference/max": 10.992064476013184, + "sampling/sampling_logp_difference/mean": 0.019959844648838043, + "step": 36 + }, + { + "clip_ratio/high_max": 1.2810827229259303e-05, + "clip_ratio/high_mean": 3.2027068073148257e-06, + "clip_ratio/low_mean": 3.29701083501277e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.617281504375569e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14004.0, + "completions/mean_length": 6952.6015625, + "completions/mean_terminated_length": 6726.24853515625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.028619796037674, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022342968732118607, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 26812791.0, + "reward": 0.234375, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 4.540153167909011e-05, + "sampling/sampling_logp_difference/max": 9.999964714050293, + 
"sampling/sampling_logp_difference/mean": 0.02002539485692978, + "step": 37 + }, + { + "clip_ratio/high_max": 1.5225089100567857e-05, + "clip_ratio/high_mean": 6.960676159906143e-06, + "clip_ratio/low_mean": 4.09088329433871e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7869508762232726e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16361.0, + "completions/mean_length": 6413.421875, + "completions/mean_terminated_length": 6174.12841796875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9452399462461472, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021800603717565536, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 27652757.0, + "reward": 0.296875, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439120292664, + "sampling/importance_sampling_ratio/min": 3.895394547726028e-05, + "sampling/sampling_logp_difference/max": 10.153130531311035, + "sampling/sampling_logp_difference/mean": 0.019722118973731995, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.9564903318023426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9564903318023426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15754.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 5176.3515625, + "completions/mean_terminated_length": 5176.3515625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 1.0444758981466293, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004153470974415541, + "learning_rate": 1e-05, + "loss": 0.0798, + 
"num_tokens": 28334386.0, + "reward": 0.2734375, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.007421077694743872, + "sampling/sampling_logp_difference/max": 4.903430938720703, + "sampling/sampling_logp_difference/mean": 0.020159056410193443, + "step": 39 + }, + { + "clip_ratio/high_max": 1.725743459246587e-05, + "clip_ratio/high_mean": 4.3143586481164675e-06, + "clip_ratio/low_mean": 2.0204584302518924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.451894306432223e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 5178.9921875, + "completions/mean_terminated_length": 5001.13525390625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0803537145256996, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002477057045325637, + "learning_rate": 1e-05, + "loss": 0.0067, + "num_tokens": 29017145.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000497102737427, + "sampling/importance_sampling_ratio/min": 0.004630985204130411, + "sampling/sampling_logp_difference/max": 5.374985694885254, + "sampling/sampling_logp_difference/mean": 0.019826076924800873, + "step": 40 + }, + { + "clip_ratio/high_max": 1.6637992303003557e-05, + "clip_ratio/high_mean": 4.159498075750889e-06, + "clip_ratio/low_mean": 2.1970684144889674e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6130182106953725e-05, + "completions/clipped_ratio": 
0.0, + "completions/max_length": 14131.0, + "completions/max_terminated_length": 14131.0, + "completions/mean_length": 4980.359375, + "completions/mean_terminated_length": 4980.359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.9510642662644386, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016275218222290277, + "learning_rate": 1e-05, + "loss": -0.0097, + "num_tokens": 29673535.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750852584839, + "sampling/importance_sampling_ratio/min": 0.000599516904912889, + "sampling/sampling_logp_difference/max": 7.419386386871338, + "sampling/sampling_logp_difference/mean": 0.01844976656138897, + "step": 41 + }, + { + "clip_ratio/high_max": 2.8087193186365766e-05, + "clip_ratio/high_mean": 7.021798296591442e-06, + "clip_ratio/low_mean": 3.9683913541921356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.670571286169434e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 5778.6953125, + "completions/mean_terminated_length": 5695.18896484375, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 1.0413239300251007, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001847646082751453, + "learning_rate": 1e-05, + "loss": -0.0045, + "num_tokens": 30436416.0, + "reward": 0.2578125, + "reward_std": 0.33903977274894714, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998501539230347, + 
"sampling/importance_sampling_ratio/min": 0.00020348970429040492, + "sampling/sampling_logp_difference/max": 8.499895095825195, + "sampling/sampling_logp_difference/mean": 0.021502099931240082, + "step": 42 + }, + { + "clip_ratio/high_max": 2.68402091023745e-05, + "clip_ratio/high_mean": 8.575278570788214e-06, + "clip_ratio/low_mean": 4.547183698377921e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.404711600931478e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14182.0, + "completions/max_terminated_length": 14182.0, + "completions/mean_length": 4875.125, + "completions/mean_terminated_length": 4875.125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 1.0464690178632736, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021134833805263042, + "learning_rate": 1e-05, + "loss": 0.0727, + "num_tokens": 31083672.0, + "reward": 0.40625, + "reward_std": 0.3584783971309662, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340176582336, + "sampling/importance_sampling_ratio/min": 0.012113225646317005, + "sampling/sampling_logp_difference/max": 4.41345739364624, + "sampling/sampling_logp_difference/mean": 0.019140049815177917, + "step": 43 + }, + { + "clip_ratio/high_max": 3.9877967992651975e-05, + "clip_ratio/high_mean": 9.969491998162994e-06, + "clip_ratio/low_mean": 3.981287841270387e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9782369273998484e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 4691.421875, + "completions/mean_terminated_length": 4505.82568359375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 1.0229775309562683, + "epoch": 
0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037735572550445795, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 31703654.0, + "reward": 0.4453125, + "reward_std": 0.2993389964103699, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492168426514, + "sampling/importance_sampling_ratio/min": 0.03150063753128052, + "sampling/sampling_logp_difference/max": 3.457747459411621, + "sampling/sampling_logp_difference/mean": 0.01912039890885353, + "step": 44 + }, + { + "clip_ratio/high_max": 3.5441889849607833e-06, + "clip_ratio/high_mean": 8.860472462401958e-07, + "clip_ratio/low_mean": 1.5137359810069029e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6023407056309225e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 6821.96875, + "completions/mean_terminated_length": 6592.48046875, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 1.1132484003901482, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0010448681423440576, + "learning_rate": 1e-05, + "loss": 0.022, + "num_tokens": 32599778.0, + "reward": 0.2265625, + "reward_std": 0.1814819872379303, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999915361404419, + "sampling/importance_sampling_ratio/min": 0.006500681862235069, + "sampling/sampling_logp_difference/max": 5.035848140716553, + "sampling/sampling_logp_difference/mean": 0.02125459350645542, + "step": 45 + }, + { + "clip_ratio/high_max": 4.652893949241843e-06, + "clip_ratio/high_mean": 1.1632234873104608e-06, + "clip_ratio/low_mean": 
5.731516603191267e-05, + "clip_ratio/low_min": 9.891066838463303e-06, + "clip_ratio/region_mean": 5.8478389746596804e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 6834.3671875, + "completions/mean_terminated_length": 6605.17626953125, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9827468693256378, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0017670176457613707, + "learning_rate": 1e-05, + "loss": 0.1105, + "num_tokens": 33492737.0, + "reward": 0.3046875, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.0021202093921601772, + "sampling/sampling_logp_difference/max": 6.156240463256836, + "sampling/sampling_logp_difference/mean": 0.019490526989102364, + "step": 46 + }, + { + "clip_ratio/high_max": 6.717360520269722e-06, + "clip_ratio/high_mean": 2.503530367903295e-06, + "clip_ratio/low_mean": 2.5672919832686603e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8176450200589898e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14098.0, + "completions/mean_length": 6175.296875, + "completions/mean_terminated_length": 5845.98388671875, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 1.1584237962961197, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0016891945851966739, + "learning_rate": 1e-05, + "loss": -0.0008, + "num_tokens": 34312455.0, + "reward": 0.1875, + "reward_std": 0.19673937559127808, + "rewards/accuracy_reward/mean": 0.1875, + 
"rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 8.086384332273155e-05, + "sampling/sampling_logp_difference/max": 9.422743797302246, + "sampling/sampling_logp_difference/mean": 0.021749887615442276, + "step": 47 + }, + { + "clip_ratio/high_max": 2.2362002255249536e-05, + "clip_ratio/high_mean": 8.189798336388776e-06, + "clip_ratio/low_mean": 2.1058204993096297e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9248002192616696e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16054.0, + "completions/mean_length": 6036.8359375, + "completions/mean_terminated_length": 5955.3623046875, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.9301538467407227, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003834392176941037, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 35102738.0, + "reward": 0.4375, + "reward_std": 0.36614155769348145, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998494386672974, + "sampling/importance_sampling_ratio/min": 0.00013992394087836146, + "sampling/sampling_logp_difference/max": 8.874411582946777, + "sampling/sampling_logp_difference/mean": 0.019147861748933792, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1501961580506759e-05, + "clip_ratio/high_mean": 2.8754903951266897e-06, + "clip_ratio/low_mean": 4.08189714562468e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.369446196506033e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 
6262.46875, + "completions/mean_terminated_length": 5764.68798828125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.8599015846848488, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029804729856550694, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 35924886.0, + "reward": 0.3984375, + "reward_std": 0.3911295533180237, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999922513961792, + "sampling/importance_sampling_ratio/min": 0.00021375219512265176, + "sampling/sampling_logp_difference/max": 9.904524803161621, + "sampling/sampling_logp_difference/mean": 0.01815103553235531, + "step": 49 + }, + { + "clip_ratio/high_max": 2.4107544049911667e-05, + "clip_ratio/high_mean": 6.026886012477917e-06, + "clip_ratio/low_mean": 3.6588148361715866e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.261503391944643e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14556.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 5926.8984375, + "completions/mean_terminated_length": 5926.8984375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 1.0042993426322937, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022071697749197483, + "learning_rate": 1e-05, + "loss": 0.0059, + "num_tokens": 36700913.0, + "reward": 0.3359375, + "reward_std": 0.3306073546409607, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000010371208191, + "sampling/importance_sampling_ratio/min": 0.0005220364546403289, + "sampling/sampling_logp_difference/max": 7.557773113250732, + 
"sampling/sampling_logp_difference/mean": 0.01954064890742302, + "step": 50 + }, + { + "clip_ratio/high_max": 4.9106265578302555e-06, + "clip_ratio/high_mean": 1.2276566394575639e-06, + "clip_ratio/low_mean": 2.634599570683349e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7573652346291055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15217.0, + "completions/mean_length": 6873.6875, + "completions/mean_terminated_length": 6645.4404296875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 1.0255412608385086, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002320924773812294, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 37604865.0, + "reward": 0.234375, + "reward_std": 0.3135228157043457, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999098777770996, + "sampling/importance_sampling_ratio/min": 0.026153141632676125, + "sampling/sampling_logp_difference/max": 3.6437859535217285, + "sampling/sampling_logp_difference/mean": 0.019532475620508194, + "step": 51 + }, + { + "clip_ratio/high_max": 1.6350510122720152e-05, + "clip_ratio/high_mean": 4.087627530680038e-06, + "clip_ratio/low_mean": 2.351988746340794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7607515221461654e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15668.0, + "completions/mean_length": 6073.8984375, + "completions/mean_terminated_length": 5992.71630859375, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 1.0713753998279572, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002212709980085492, + 
"learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 38405196.0, + "reward": 0.359375, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998978972434998, + "sampling/importance_sampling_ratio/min": 8.706459084351081e-06, + "sampling/sampling_logp_difference/max": 11.651445388793945, + "sampling/sampling_logp_difference/mean": 0.021252838894724846, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.729486718384578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.729486718384578e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15299.0, + "completions/mean_length": 5838.71875, + "completions/mean_terminated_length": 5671.33349609375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 1.021155133843422, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001135052996687591, + "learning_rate": 1e-05, + "loss": 0.0178, + "num_tokens": 39171704.0, + "reward": 0.28125, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.003084881929680705, + "sampling/sampling_logp_difference/max": 5.7812418937683105, + "sampling/sampling_logp_difference/mean": 0.020781882107257843, + "step": 53 + }, + { + "clip_ratio/high_max": 1.7124169744420215e-05, + "clip_ratio/high_mean": 4.281042436105054e-06, + "clip_ratio/low_mean": 3.706903294187214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.135007543482061e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 14617.0, + "completions/max_terminated_length": 14617.0, + "completions/mean_length": 6358.5859375, + "completions/mean_terminated_length": 6358.5859375, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9720487147569656, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002638082252815366, + "learning_rate": 1e-05, + "loss": 0.0145, + "num_tokens": 40003859.0, + "reward": 0.40625, + "reward_std": 0.3174618184566498, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000380277633667, + "sampling/importance_sampling_ratio/min": 0.01960253342986107, + "sampling/sampling_logp_difference/max": 3.932096481323242, + "sampling/sampling_logp_difference/mean": 0.01991666667163372, + "step": 54 + }, + { + "clip_ratio/high_max": 6.55582925901399e-06, + "clip_ratio/high_mean": 2.994117721755174e-06, + "clip_ratio/low_mean": 2.222621503733535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5220332759090525e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14753.0, + "completions/max_terminated_length": 14753.0, + "completions/mean_length": 4634.1875, + "completions/mean_terminated_length": 4634.1875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9715309366583824, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001994960242882371, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 40616483.0, + "reward": 0.4375, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000698566436768, + 
"sampling/importance_sampling_ratio/min": 1.0510009815334342e-05, + "sampling/sampling_logp_difference/max": 11.46318244934082, + "sampling/sampling_logp_difference/mean": 0.01902047172188759, + "step": 55 + }, + { + "clip_ratio/high_max": 2.2474248908110894e-05, + "clip_ratio/high_mean": 7.571314540655294e-06, + "clip_ratio/low_mean": 4.3583780325207044e-05, + "clip_ratio/low_min": 4.6013396968191955e-06, + "clip_ratio/region_mean": 5.1155094070054474e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 6596.25, + "completions/mean_terminated_length": 6361.34423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.8207943215966225, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019902780186384916, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 41484443.0, + "reward": 0.4453125, + "reward_std": 0.326668381690979, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016689300537, + "sampling/importance_sampling_ratio/min": 7.485233072657138e-05, + "sampling/sampling_logp_difference/max": 9.499993324279785, + "sampling/sampling_logp_difference/mean": 0.018301833420991898, + "step": 56 + }, + { + "clip_ratio/high_max": 3.0019932637515012e-06, + "clip_ratio/high_mean": 7.504983159378753e-07, + "clip_ratio/low_mean": 4.332785601945943e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.407835376696312e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6785.75, + "completions/mean_terminated_length": 6313.70458984375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + 
"entropy": 0.9876058474183083, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015235114842653275, + "learning_rate": 1e-05, + "loss": 0.0128, + "num_tokens": 42372235.0, + "reward": 0.2421875, + "reward_std": 0.325075626373291, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999551773071289, + "sampling/importance_sampling_ratio/min": 0.026679370552301407, + "sampling/sampling_logp_difference/max": 3.6238646507263184, + "sampling/sampling_logp_difference/mean": 0.019945615902543068, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1349006601667497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1349006601667497e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 4881.2109375, + "completions/mean_terminated_length": 4510.1533203125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.989942155778408, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002033712575212121, + "learning_rate": 1e-05, + "loss": 0.1088, + "num_tokens": 43015238.0, + "reward": 0.4375, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000300407409668, + "sampling/importance_sampling_ratio/min": 0.0001238943514181301, + "sampling/sampling_logp_difference/max": 8.996081352233887, + "sampling/sampling_logp_difference/mean": 0.01887543685734272, + "step": 58 + }, + { + "clip_ratio/high_max": 2.584004687378183e-05, + "clip_ratio/high_mean": 6.4600117184454575e-06, + 
"clip_ratio/low_mean": 2.1371045761497953e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7831058105221018e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15001.0, + "completions/max_terminated_length": 15001.0, + "completions/mean_length": 4725.3984375, + "completions/mean_terminated_length": 4725.3984375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 1.0350637435913086, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030296226032078266, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 43637737.0, + "reward": 0.4453125, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999939203262329, + "sampling/importance_sampling_ratio/min": 0.00022932067804504186, + "sampling/sampling_logp_difference/max": 8.380389213562012, + "sampling/sampling_logp_difference/mean": 0.01995944231748581, + "step": 59 + }, + { + "clip_ratio/high_max": 1.994733975152485e-05, + "clip_ratio/high_mean": 4.986834937881213e-06, + "clip_ratio/low_mean": 3.5168303838872816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.015513832200668e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 4918.171875, + "completions/mean_terminated_length": 4736.1748046875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.965274304151535, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002758471528068185, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 44285327.0, + "reward": 0.328125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 
0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999663233757019, + "sampling/importance_sampling_ratio/min": 0.010958661325275898, + "sampling/sampling_logp_difference/max": 4.513625144958496, + "sampling/sampling_logp_difference/mean": 0.019083233550190926, + "step": 60 + }, + { + "clip_ratio/high_max": 1.0621563887980301e-05, + "clip_ratio/high_mean": 2.6553909719950752e-06, + "clip_ratio/low_mean": 3.838553107016196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1040922042157035e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15031.0, + "completions/mean_length": 4998.2890625, + "completions/mean_terminated_length": 4908.6376953125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9200445115566254, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027611786499619484, + "learning_rate": 1e-05, + "loss": 0.0575, + "num_tokens": 44944356.0, + "reward": 0.3515625, + "reward_std": 0.3895368278026581, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884366989136, + "sampling/importance_sampling_ratio/min": 0.0018651526188477874, + "sampling/sampling_logp_difference/max": 6.284412384033203, + "sampling/sampling_logp_difference/mean": 0.017853498458862305, + "step": 61 + }, + { + "clip_ratio/high_max": 1.0136624496226432e-05, + "clip_ratio/high_mean": 2.534156124056608e-06, + "clip_ratio/low_mean": 2.0260404085092887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2794560095462657e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6290.1796875, + 
"completions/mean_terminated_length": 6129.96044921875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.9360214695334435, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015557854203507304, + "learning_rate": 1e-05, + "loss": 0.0111, + "num_tokens": 45767867.0, + "reward": 0.34375, + "reward_std": 0.30168038606643677, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999427795410156, + "sampling/importance_sampling_ratio/min": 0.0011004531988874078, + "sampling/sampling_logp_difference/max": 6.812033176422119, + "sampling/sampling_logp_difference/mean": 0.0200855303555727, + "step": 62 + }, + { + "clip_ratio/high_max": 2.2559511307918e-06, + "clip_ratio/high_mean": 5.6398778269795e-07, + "clip_ratio/low_mean": 4.51761221711422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.574010984015331e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16366.0, + "completions/mean_length": 6486.15625, + "completions/mean_terminated_length": 6248.6083984375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.863138921558857, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026953541673719883, + "learning_rate": 1e-05, + "loss": -0.0194, + "num_tokens": 46618575.0, + "reward": 0.2578125, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999406337738037, + "sampling/importance_sampling_ratio/min": 0.0011708897072821856, + "sampling/sampling_logp_difference/max": 6.749991416931152, + 
"sampling/sampling_logp_difference/mean": 0.01863238587975502, + "step": 63 + }, + { + "clip_ratio/high_max": 1.0073357771034352e-05, + "clip_ratio/high_mean": 2.518339442758588e-06, + "clip_ratio/low_mean": 2.787370635815023e-05, + "clip_ratio/low_min": 3.837534222839167e-06, + "clip_ratio/region_mean": 3.0392045573535142e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 6442.7734375, + "completions/mean_terminated_length": 6284.9765625, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0242054909467697, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024442619178444147, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 47462274.0, + "reward": 0.328125, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998892545700073, + "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09, + "sampling/sampling_logp_difference/max": 19.124980926513672, + "sampling/sampling_logp_difference/mean": 0.019810764119029045, + "step": 64 + }, + { + "clip_ratio/high_max": 1.220810372615233e-05, + "clip_ratio/high_mean": 3.0520259315380827e-06, + "clip_ratio/low_mean": 4.339240456374682e-05, + "clip_ratio/low_min": 4.491233084991109e-06, + "clip_ratio/region_mean": 4.644443038159807e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 4807.765625, + "completions/mean_terminated_length": 4716.6142578125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 1.045751042664051, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.002512057079002261, + "learning_rate": 1e-05, + "loss": 0.003, + "num_tokens": 48096692.0, + "reward": 0.3671875, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999058842658997, + "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05, + "sampling/sampling_logp_difference/max": 11.374892234802246, + "sampling/sampling_logp_difference/mean": 0.01960371434688568, + "step": 65 + }, + { + "clip_ratio/high_max": 5.37941218681226e-06, + "clip_ratio/high_mean": 1.344853046703065e-06, + "clip_ratio/low_mean": 3.0161771633174794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1506624850408116e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 6703.8359375, + "completions/mean_terminated_length": 6471.51220703125, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 1.0592866837978363, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016389708034694195, + "learning_rate": 1e-05, + "loss": -0.024, + "num_tokens": 48974399.0, + "reward": 0.2734375, + "reward_std": 0.2585548758506775, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999353885650635, + "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06, + "sampling/sampling_logp_difference/max": 11.8125, + "sampling/sampling_logp_difference/mean": 0.020880095660686493, + "step": 66 + }, + { + "clip_ratio/high_max": 7.093600515872822e-06, + "clip_ratio/high_mean": 1.7734001289682055e-06, + "clip_ratio/low_mean": 4.470584758564655e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.647924811251869e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16295.0, + "completions/mean_length": 6140.5078125, + "completions/mean_terminated_length": 5724.10546875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 1.0998501181602478, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003946912474930286, + "learning_rate": 1e-05, + "loss": 0.0448, + "num_tokens": 49779920.0, + "reward": 0.34375, + "reward_std": 0.36796674132347107, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 2.849436668839189e-07, + "sampling/sampling_logp_difference/max": 15.070974349975586, + "sampling/sampling_logp_difference/mean": 0.021355850622057915, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.313956779038563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.313956779038563e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 6689.8046875, + "completions/mean_terminated_length": 6213.04052734375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.8561654165387154, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021656695753335953, + "learning_rate": 1e-05, + "loss": 0.0283, + "num_tokens": 50655023.0, + "reward": 0.203125, + "reward_std": 0.21723884344100952, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999941885471344, + "sampling/importance_sampling_ratio/min": 2.836359499269747e-06, + "sampling/sampling_logp_difference/max": 12.772989273071289, + "sampling/sampling_logp_difference/mean": 0.01873670145869255, + "step": 68 + }, + { + "clip_ratio/high_max": 2.3421607693308033e-05, + "clip_ratio/high_mean": 7.242933975248889e-06, + "clip_ratio/low_mean": 3.896083626386826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.620377103492501e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14330.0, + "completions/max_terminated_length": 14330.0, + "completions/mean_length": 5707.0078125, + "completions/mean_terminated_length": 5707.0078125, + "completions/min_length": 625.0, + "completions/min_terminated_length": 625.0, + "entropy": 1.1396166533231735, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004121148493140936, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 51406536.0, + "reward": 0.3125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999328851699829, + "sampling/importance_sampling_ratio/min": 0.0005196487763896585, + "sampling/sampling_logp_difference/max": 7.562357425689697, + "sampling/sampling_logp_difference/mean": 0.020000409334897995, + "step": 69 + }, + { + "clip_ratio/high_max": 1.82290532393381e-05, + "clip_ratio/high_mean": 4.557263309834525e-06, + "clip_ratio/low_mean": 2.5275351731579576e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9832615496161452e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5655.6328125, + "completions/mean_terminated_length": 5571.1572265625, + "completions/min_length": 157.0, + 
"completions/min_terminated_length": 157.0, + "entropy": 0.8928132206201553, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032538517843931913, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 52148473.0, + "reward": 0.3984375, + "reward_std": 0.29432642459869385, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000033378601074, + "sampling/importance_sampling_ratio/min": 0.0017573959194123745, + "sampling/sampling_logp_difference/max": 6.343922138214111, + "sampling/sampling_logp_difference/mean": 0.018881790339946747, + "step": 70 + }, + { + "clip_ratio/high_max": 1.2836022506235167e-05, + "clip_ratio/high_mean": 3.209005626558792e-06, + "clip_ratio/low_mean": 3.8109637216621195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.131864307055366e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7399.7890625, + "completions/mean_terminated_length": 7034.5771484375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.8808257132768631, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002061733277514577, + "learning_rate": 1e-05, + "loss": 0.0191, + "num_tokens": 53113230.0, + "reward": 0.3046875, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673962593079, + "sampling/importance_sampling_ratio/min": 0.005283349193632603, + "sampling/sampling_logp_difference/max": 5.243195056915283, + "sampling/sampling_logp_difference/mean": 0.018456293269991875, + "step": 71 + }, + { + "clip_ratio/high_max": 
1.5806871488166507e-05, + "clip_ratio/high_mean": 4.739466817227367e-06, + "clip_ratio/low_mean": 3.610486896832299e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.084433521711617e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16208.0, + "completions/mean_length": 5730.9609375, + "completions/mean_terminated_length": 5475.2880859375, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9486126750707626, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012298432411625981, + "learning_rate": 1e-05, + "loss": 0.0208, + "num_tokens": 53864049.0, + "reward": 0.359375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999348521232605, + "sampling/importance_sampling_ratio/min": 4.832820559386164e-05, + "sampling/sampling_logp_difference/max": 9.937495231628418, + "sampling/sampling_logp_difference/mean": 0.01919996738433838, + "step": 72 + }, + { + "clip_ratio/high_max": 1.2390134997986024e-05, + "clip_ratio/high_mean": 3.097533749496506e-06, + "clip_ratio/low_mean": 3.8867822581778455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.19653564449618e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13500.0, + "completions/mean_length": 4620.5703125, + "completions/mean_terminated_length": 4527.94482421875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9557560831308365, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002882040338590741, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 54473498.0, + "reward": 0.3984375, + "reward_std": 
0.39294686913490295, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915195465088, + "sampling/importance_sampling_ratio/min": 1.577107298089686e-07, + "sampling/sampling_logp_difference/max": 15.662503242492676, + "sampling/sampling_logp_difference/mean": 0.018525000661611557, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.088819471486204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.088819471486204e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16314.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 5074.0703125, + "completions/mean_terminated_length": 5074.0703125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8830869868397713, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003324020653963089, + "learning_rate": 1e-05, + "loss": 0.0305, + "num_tokens": 55141787.0, + "reward": 0.4609375, + "reward_std": 0.30115634202957153, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999203681945801, + "sampling/importance_sampling_ratio/min": 0.0009876838885247707, + "sampling/sampling_logp_difference/max": 6.920147895812988, + "sampling/sampling_logp_difference/mean": 0.018072880804538727, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.526649884908693e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.526649884908693e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15251.0, + "completions/max_terminated_length": 15251.0, + "completions/mean_length": 6192.1015625, + 
"completions/mean_terminated_length": 6192.1015625, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "entropy": 1.0888547226786613, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017452294705435634, + "learning_rate": 1e-05, + "loss": 0.0216, + "num_tokens": 55954144.0, + "reward": 0.2890625, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473690986633, + "sampling/importance_sampling_ratio/min": 5.061922365712235e-07, + "sampling/sampling_logp_difference/max": 14.496349334716797, + "sampling/sampling_logp_difference/mean": 0.021221645176410675, + "step": 75 + }, + { + "clip_ratio/high_max": 1.6768677141953958e-05, + "clip_ratio/high_mean": 5.080836899651331e-06, + "clip_ratio/low_mean": 3.340929970363504e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.84901372854074e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15740.0, + "completions/mean_length": 6204.296875, + "completions/mean_terminated_length": 6124.1416015625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 1.0423575639724731, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0033357341308146715, + "learning_rate": 1e-05, + "loss": 0.1073, + "num_tokens": 56765470.0, + "reward": 0.3359375, + "reward_std": 0.37875816226005554, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99998539686203, + "sampling/importance_sampling_ratio/min": 4.564182381727733e-05, + "sampling/sampling_logp_difference/max": 9.994686126708984, + 
"sampling/sampling_logp_difference/mean": 0.01908688060939312, + "step": 76 + }, + { + "clip_ratio/high_max": 3.149884150843718e-06, + "clip_ratio/high_mean": 7.874710377109295e-07, + "clip_ratio/low_mean": 2.430614893000893e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.509361991087644e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14409.0, + "completions/max_terminated_length": 14409.0, + "completions/mean_length": 5070.3125, + "completions/mean_terminated_length": 5070.3125, + "completions/min_length": 629.0, + "completions/min_terminated_length": 629.0, + "entropy": 1.0737399458885193, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038695367984473705, + "learning_rate": 1e-05, + "loss": 0.0015, + "num_tokens": 57432958.0, + "reward": 0.390625, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999223947525024, + "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06, + "sampling/sampling_logp_difference/max": 13.376652717590332, + "sampling/sampling_logp_difference/mean": 0.01970684342086315, + "step": 77 + }, + { + "clip_ratio/high_max": 1.9821940441033803e-05, + "clip_ratio/high_mean": 4.955485110258451e-06, + "clip_ratio/low_mean": 2.9055729555693688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.401121466595214e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15799.0, + "completions/mean_length": 5750.21875, + "completions/mean_terminated_length": 5495.00830078125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.9708107560873032, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002927646040916443, + "learning_rate": 1e-05, + 
"loss": 0.0166, + "num_tokens": 58187426.0, + "reward": 0.296875, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999390840530396, + "sampling/importance_sampling_ratio/min": 0.015204614959657192, + "sampling/sampling_logp_difference/max": 4.186156272888184, + "sampling/sampling_logp_difference/mean": 0.019483914598822594, + "step": 78 + }, + { + "clip_ratio/high_max": 2.3815636723156786e-05, + "clip_ratio/high_mean": 5.953909180789196e-06, + "clip_ratio/low_mean": 4.989707144886779e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.585097960647545e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15938.0, + "completions/mean_length": 6067.484375, + "completions/mean_terminated_length": 5986.251953125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9576351121068001, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0026169484481215477, + "learning_rate": 1e-05, + "loss": -0.0055, + "num_tokens": 58983336.0, + "reward": 0.390625, + "reward_std": 0.3406373858451843, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620914459229, + "sampling/importance_sampling_ratio/min": 1.974713995878119e-06, + "sampling/sampling_logp_difference/max": 13.135087013244629, + "sampling/sampling_logp_difference/mean": 0.019007554277777672, + "step": 79 + }, + { + "clip_ratio/high_max": 2.4238934656750644e-05, + "clip_ratio/high_mean": 7.786730066072778e-06, + "clip_ratio/low_mean": 4.5700241571466904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3486972547034384e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 13640.0, + "completions/max_terminated_length": 13640.0, + "completions/mean_length": 4612.8984375, + "completions/mean_terminated_length": 4612.8984375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.9636320173740387, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015429699560627341, + "learning_rate": 1e-05, + "loss": -0.018, + "num_tokens": 59590763.0, + "reward": 0.421875, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08, + "sampling/sampling_logp_difference/max": 17.468652725219727, + "sampling/sampling_logp_difference/mean": 0.019313856959342957, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0911465842109465e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0911465842109465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 6101.3125, + "completions/mean_terminated_length": 5854.5283203125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.8831139355897903, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022505265660583973, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 60391283.0, + "reward": 0.3125, + "reward_std": 0.29302334785461426, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + 
"sampling/importance_sampling_ratio/min": 0.0003816343960352242, + "sampling/sampling_logp_difference/max": 7.871047496795654, + "sampling/sampling_logp_difference/mean": 0.018377842381596565, + "step": 81 + }, + { + "clip_ratio/high_max": 1.547606643725885e-05, + "clip_ratio/high_mean": 3.869016609314713e-06, + "clip_ratio/low_mean": 2.478705800967873e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8656074391619768e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14862.0, + "completions/mean_length": 4705.9921875, + "completions/mean_terminated_length": 4614.03955078125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.9557913094758987, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002069958718493581, + "learning_rate": 1e-05, + "loss": -0.0015, + "num_tokens": 61021490.0, + "reward": 0.4296875, + "reward_std": 0.2637920379638672, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030232429504, + "sampling/importance_sampling_ratio/min": 2.76673017651774e-05, + "sampling/sampling_logp_difference/max": 10.495259284973145, + "sampling/sampling_logp_difference/mean": 0.018629569560289383, + "step": 82 + }, + { + "clip_ratio/high_max": 2.0910484636260662e-05, + "clip_ratio/high_mean": 5.2276211590651656e-06, + "clip_ratio/low_mean": 1.952954164607945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4757162805144617e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13745.0, + "completions/max_terminated_length": 13745.0, + "completions/mean_length": 5116.78125, + "completions/mean_terminated_length": 5116.78125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 1.0198405236005783, 
+ "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034461067989468575, + "learning_rate": 1e-05, + "loss": -0.0073, + "num_tokens": 61695382.0, + "reward": 0.265625, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999936819076538, + "sampling/importance_sampling_ratio/min": 0.012227212078869343, + "sampling/sampling_logp_difference/max": 4.4040913581848145, + "sampling/sampling_logp_difference/mean": 0.019400250166654587, + "step": 83 + }, + { + "clip_ratio/high_max": 1.5340228401328204e-05, + "clip_ratio/high_mean": 3.835057100332051e-06, + "clip_ratio/low_mean": 3.150914017169271e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.534419727202476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 5891.9140625, + "completions/mean_terminated_length": 5553.45947265625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.9568078517913818, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025854657869786024, + "learning_rate": 1e-05, + "loss": 0.1013, + "num_tokens": 62474883.0, + "reward": 0.3203125, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001013278961182, + "sampling/importance_sampling_ratio/min": 0.0015072470996528864, + "sampling/sampling_logp_difference/max": 6.497470378875732, + "sampling/sampling_logp_difference/mean": 0.019574139267206192, + "step": 84 + }, + { + "clip_ratio/high_max": 1.108303422370227e-05, + "clip_ratio/high_mean": 2.7707585559255676e-06, + 
"clip_ratio/low_mean": 2.2325777763398946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5096536319324514e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13671.0, + "completions/mean_length": 5300.3359375, + "completions/mean_terminated_length": 5213.06298828125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.9722280204296112, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025075653102248907, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 63172454.0, + "reward": 0.203125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 0.00020346972451079637, + "sampling/sampling_logp_difference/max": 8.499993324279785, + "sampling/sampling_logp_difference/mean": 0.02002432942390442, + "step": 85 + }, + { + "clip_ratio/high_max": 1.3991947980684927e-05, + "clip_ratio/high_mean": 3.4979869951712317e-06, + "clip_ratio/low_mean": 4.893367201930232e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.243165958290774e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15617.0, + "completions/mean_length": 6364.21875, + "completions/mean_terminated_length": 6205.1748046875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 1.0607495978474617, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017982006538659334, + "learning_rate": 1e-05, + "loss": -0.0117, + "num_tokens": 64007602.0, + "reward": 0.2890625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.2890625, + 
"rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 3.823801307589747e-05, + "sampling/sampling_logp_difference/max": 10.171680450439453, + "sampling/sampling_logp_difference/mean": 0.020373597741127014, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6416430046083406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6416430046083406e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14709.0, + "completions/mean_length": 5746.3125, + "completions/mean_terminated_length": 5403.1611328125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "entropy": 0.9913106113672256, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002207317156717181, + "learning_rate": 1e-05, + "loss": 0.063, + "num_tokens": 64762058.0, + "reward": 0.34375, + "reward_std": 0.3264310359954834, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999239444732666, + "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08, + "sampling/sampling_logp_difference/max": 16.744617462158203, + "sampling/sampling_logp_difference/mean": 0.020608089864253998, + "step": 87 + }, + { + "clip_ratio/high_max": 1.2681661701208213e-05, + "clip_ratio/high_mean": 3.1704154253020533e-06, + "clip_ratio/low_mean": 3.541917828897567e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.85895939416514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6088.5625, + 
"completions/mean_terminated_length": 5841.47216796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.9040444120764732, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012974507408216596, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 65561002.0, + "reward": 0.3671875, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998487234115601, + "sampling/importance_sampling_ratio/min": 6.021501121722395e-06, + "sampling/sampling_logp_difference/max": 12.020174026489258, + "sampling/sampling_logp_difference/mean": 0.01939838007092476, + "step": 88 + }, + { + "clip_ratio/high_max": 7.807132533343975e-06, + "clip_ratio/high_mean": 1.9517831333359936e-06, + "clip_ratio/low_mean": 1.8564539345788944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.05163223654381e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15021.0, + "completions/mean_length": 5765.5, + "completions/mean_terminated_length": 5510.65625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.9966336265206337, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0013380619930103421, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 66318482.0, + "reward": 0.375, + "reward_std": 0.13994136452674866, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999471306800842, + "sampling/importance_sampling_ratio/min": 7.288413598871557e-06, + "sampling/sampling_logp_difference/max": 11.829224586486816, + 
"sampling/sampling_logp_difference/mean": 0.018109245225787163, + "step": 89 + }, + { + "clip_ratio/high_max": 1.7906912489706883e-05, + "clip_ratio/high_mean": 4.476728122426721e-06, + "clip_ratio/low_mean": 2.5812531305291486e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0289259655091882e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16120.0, + "completions/mean_length": 5462.78125, + "completions/mean_terminated_length": 5200.67236328125, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "entropy": 0.9345141425728798, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023930128663778305, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 67038582.0, + "reward": 0.46875, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513030052185, + "sampling/importance_sampling_ratio/min": 0.008508839644491673, + "sampling/sampling_logp_difference/max": 4.7666497230529785, + "sampling/sampling_logp_difference/mean": 0.019220296293497086, + "step": 90 + }, + { + "clip_ratio/high_max": 1.551389118503721e-05, + "clip_ratio/high_mean": 3.878472796259302e-06, + "clip_ratio/low_mean": 3.239646628117043e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6274939645863924e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15034.0, + "completions/max_terminated_length": 15034.0, + "completions/mean_length": 5547.5078125, + "completions/mean_terminated_length": 5547.5078125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0511749312281609, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0013633714988827705, + "learning_rate": 1e-05, 
+ "loss": 0.0462, + "num_tokens": 67774487.0, + "reward": 0.203125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05, + "sampling/sampling_logp_difference/max": 11.418023109436035, + "sampling/sampling_logp_difference/mean": 0.020328814163804054, + "step": 91 + }, + { + "clip_ratio/high_max": 1.5384989410449634e-05, + "clip_ratio/high_mean": 3.846247352612409e-06, + "clip_ratio/low_mean": 3.441604167164769e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.826228908110352e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5835.4140625, + "completions/mean_terminated_length": 5406.609375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 1.0024723336100578, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0036165034398436546, + "learning_rate": 1e-05, + "loss": 0.0373, + "num_tokens": 68541660.0, + "reward": 0.34375, + "reward_std": 0.3584783673286438, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 9.518130354990717e-06, + "sampling/sampling_logp_difference/max": 11.562312126159668, + "sampling/sampling_logp_difference/mean": 0.020469525828957558, + "step": 92 + }, + { + "clip_ratio/high_max": 6.105602551542688e-06, + "clip_ratio/high_mean": 1.526400637885672e-06, + "clip_ratio/low_mean": 5.3129634352444555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.46560352177039e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15695.0, + "completions/mean_length": 6252.609375, + "completions/mean_terminated_length": 6172.83447265625, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "entropy": 1.0325519517064095, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022011541295796633, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 69365418.0, + "reward": 0.3828125, + "reward_std": 0.32301604747772217, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998809099197388, + "sampling/importance_sampling_ratio/min": 0.0005531083443202078, + "sampling/sampling_logp_difference/max": 7.4999566078186035, + "sampling/sampling_logp_difference/mean": 0.02079072594642639, + "step": 93 + }, + { + "clip_ratio/high_max": 4.348128641140647e-06, + "clip_ratio/high_mean": 1.0870321602851618e-06, + "clip_ratio/low_mean": 3.0097819148977578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.118485085451539e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15316.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 5581.484375, + "completions/mean_terminated_length": 5581.484375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.9222500994801521, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002300912281498313, + "learning_rate": 1e-05, + "loss": -0.0007, + "num_tokens": 70099320.0, + "reward": 0.296875, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9998577833175659, + "sampling/importance_sampling_ratio/min": 8.140386853483506e-08, + "sampling/sampling_logp_difference/max": 16.323843002319336, + "sampling/sampling_logp_difference/mean": 0.01952272653579712, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5122252029395895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5122252029395895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15781.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5424.140625, + "completions/mean_terminated_length": 5424.140625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 1.0446564108133316, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016312639927491546, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 70811474.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000094175338745, + "sampling/importance_sampling_ratio/min": 0.0021919538266956806, + "sampling/sampling_logp_difference/max": 6.12296199798584, + "sampling/sampling_logp_difference/mean": 0.019741754978895187, + "step": 95 + }, + { + "clip_ratio/high_max": 1.0354576261306647e-05, + "clip_ratio/high_mean": 3.496124691082514e-06, + "clip_ratio/low_mean": 4.096481598026003e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.446094089871622e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 5884.9609375, + "completions/mean_terminated_length": 5884.9609375, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9605691060423851, + "epoch": 
0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032865386456251144, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 71582701.0, + "reward": 0.4140625, + "reward_std": 0.3514111638069153, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999833106994629, + "sampling/importance_sampling_ratio/min": 1.149311810877407e-05, + "sampling/sampling_logp_difference/max": 11.373762130737305, + "sampling/sampling_logp_difference/mean": 0.019438734278082848, + "step": 96 + }, + { + "clip_ratio/high_max": 1.026998006636859e-05, + "clip_ratio/high_mean": 2.5674950165921473e-06, + "clip_ratio/low_mean": 3.5440503552308655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8007998455213965e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15361.0, + "completions/max_terminated_length": 15361.0, + "completions/mean_length": 4835.09375, + "completions/mean_terminated_length": 4835.09375, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.9038172215223312, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004721678793430328, + "learning_rate": 1e-05, + "loss": 0.1143, + "num_tokens": 72220025.0, + "reward": 0.4765625, + "reward_std": 0.38481879234313965, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99994957447052, + "sampling/importance_sampling_ratio/min": 2.710051205667696e-07, + "sampling/sampling_logp_difference/max": 15.12112808227539, + "sampling/sampling_logp_difference/mean": 0.017888439819216728, + "step": 97 + }, + { + "clip_ratio/high_max": 2.93432283342554e-05, + "clip_ratio/high_mean": 9.56252398509605e-06, + "clip_ratio/low_mean": 
4.7865792453194445e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.742831808674964e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14431.0, + "completions/mean_length": 5979.078125, + "completions/mean_terminated_length": 5897.1494140625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 1.0227951630949974, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0010532280430197716, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 73005515.0, + "reward": 0.2890625, + "reward_std": 0.30115631222724915, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999090433120728, + "sampling/importance_sampling_ratio/min": 0.00030157779110595584, + "sampling/sampling_logp_difference/max": 8.10648250579834, + "sampling/sampling_logp_difference/mean": 0.019633149728178978, + "step": 98 + }, + { + "clip_ratio/high_max": 4.203234766464448e-06, + "clip_ratio/high_mean": 1.050808691616112e-06, + "clip_ratio/low_mean": 2.5574990331733716e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6625799137036665e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15886.0, + "completions/max_terminated_length": 15886.0, + "completions/mean_length": 4292.1796875, + "completions/mean_terminated_length": 4292.1796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.8719984591007233, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038324075285345316, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 73572794.0, + "reward": 0.4375, + "reward_std": 0.2972046136856079, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999188780784607, + "sampling/importance_sampling_ratio/min": 0.015675775706768036, + "sampling/sampling_logp_difference/max": 4.155638694763184, + "sampling/sampling_logp_difference/mean": 0.018074234947562218, + "step": 99 + }, + { + "clip_ratio/high_max": 4.431366960488958e-06, + "clip_ratio/high_mean": 1.1078417401222396e-06, + "clip_ratio/low_mean": 4.433405501913512e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.54418968729442e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14674.0, + "completions/max_terminated_length": 14674.0, + "completions/mean_length": 5449.2890625, + "completions/mean_terminated_length": 5449.2890625, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9137986451387405, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004843447357416153, + "learning_rate": 1e-05, + "loss": 0.0166, + "num_tokens": 74289607.0, + "reward": 0.5, + "reward_std": 0.40609243512153625, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 8.851584993863071e-07, + "sampling/sampling_logp_difference/max": 13.937499046325684, + "sampling/sampling_logp_difference/mean": 0.018183842301368713, + "step": 100 + }, + { + "clip_ratio/high_max": 8.212076863856055e-06, + "clip_ratio/high_mean": 2.0530192159640137e-06, + "clip_ratio/low_mean": 3.6279372466196946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.833239122741361e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16163.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 4983.3515625, + "completions/mean_terminated_length": 4983.3515625, + 
"completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "entropy": 0.9354705810546875, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037651765160262585, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 74946484.0, + "reward": 0.3671875, + "reward_std": 0.3090519309043884, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 0.00011593531962716952, + "sampling/sampling_logp_difference/max": 9.062478065490723, + "sampling/sampling_logp_difference/mean": 0.018207306042313576, + "step": 101 + }, + { + "clip_ratio/high_max": 1.3182888324081432e-05, + "clip_ratio/high_mean": 3.295722081020358e-06, + "clip_ratio/low_mean": 2.544108633628639e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8736808644680423e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16039.0, + "completions/mean_length": 6351.1015625, + "completions/mean_terminated_length": 6027.45947265625, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "entropy": 0.9310042560100555, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0009160125628113747, + "learning_rate": 1e-05, + "loss": -0.023, + "num_tokens": 75779145.0, + "reward": 0.3828125, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998877048492432, + "sampling/importance_sampling_ratio/min": 0.0002961359277833253, + "sampling/sampling_logp_difference/max": 8.1246919631958, + "sampling/sampling_logp_difference/mean": 0.018513178452849388, + "step": 102 
+ }, + { + "clip_ratio/high_max": 1.1402620202716207e-05, + "clip_ratio/high_mean": 3.935649147024378e-06, + "clip_ratio/low_mean": 3.059757568735222e-05, + "clip_ratio/low_min": 4.3258582991256844e-06, + "clip_ratio/region_mean": 3.45332257438713e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14471.0, + "completions/mean_length": 5293.40625, + "completions/mean_terminated_length": 4935.64501953125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 1.0732879787683487, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023993055801838636, + "learning_rate": 1e-05, + "loss": 0.1021, + "num_tokens": 76475557.0, + "reward": 0.34375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000077724456787, + "sampling/importance_sampling_ratio/min": 6.613240111619234e-05, + "sampling/sampling_logp_difference/max": 9.623851776123047, + "sampling/sampling_logp_difference/mean": 0.020792219787836075, + "step": 103 + }, + { + "clip_ratio/high_max": 2.130644793396641e-05, + "clip_ratio/high_mean": 8.929533635182452e-06, + "clip_ratio/low_mean": 2.663600798769039e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.556554071337814e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 7619.7578125, + "completions/mean_terminated_length": 7409.41650390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9646238535642624, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014872358879074454, + "learning_rate": 1e-05, + "loss": 0.0439, + "num_tokens": 77474310.0, + 
"reward": 0.34375, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999638795852661, + "sampling/importance_sampling_ratio/min": 0.0016686831368133426, + "sampling/sampling_logp_difference/max": 6.395720481872559, + "sampling/sampling_logp_difference/mean": 0.020074717700481415, + "step": 104 + }, + { + "clip_ratio/high_max": 1.7765815300663235e-05, + "clip_ratio/high_mean": 5.154013138053415e-06, + "clip_ratio/low_mean": 5.166909659237717e-05, + "clip_ratio/low_min": 8.365680514543783e-06, + "clip_ratio/region_mean": 5.68231100714911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15984.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 5959.921875, + "completions/mean_terminated_length": 5959.921875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.004471093416214, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00398358516395092, + "learning_rate": 1e-05, + "loss": 0.1016, + "num_tokens": 78257132.0, + "reward": 0.359375, + "reward_std": 0.3653082847595215, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000170469284058, + "sampling/importance_sampling_ratio/min": 0.0030075267422944307, + "sampling/sampling_logp_difference/max": 5.806637287139893, + "sampling/sampling_logp_difference/mean": 0.020755283534526825, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6946955838648137e-05, + "clip_ratio/high_mean": 4.236738959662034e-06, + "clip_ratio/low_mean": 4.510891039899434e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.934564867653535e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 13736.0, + "completions/mean_length": 5427.03125, + "completions/mean_terminated_length": 5340.755859375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9117375314235687, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019883522763848305, + "learning_rate": 1e-05, + "loss": 0.01, + "num_tokens": 78971072.0, + "reward": 0.375, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000550746917725, + "sampling/importance_sampling_ratio/min": 0.0008046010043472052, + "sampling/sampling_logp_difference/max": 7.125164031982422, + "sampling/sampling_logp_difference/mean": 0.018812140449881554, + "step": 106 + }, + { + "clip_ratio/high_max": 2.968176841022796e-05, + "clip_ratio/high_mean": 7.42044210255699e-06, + "clip_ratio/low_mean": 3.220799408154562e-05, + "clip_ratio/low_min": 5.315981979947537e-06, + "clip_ratio/region_mean": 3.962843629778945e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16293.0, + "completions/max_terminated_length": 16293.0, + "completions/mean_length": 6062.078125, + "completions/mean_terminated_length": 6062.078125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 1.0164100378751755, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00450351694598794, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 79764434.0, + "reward": 0.2578125, + "reward_std": 0.26355957984924316, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999713897705078, + 
"sampling/importance_sampling_ratio/min": 0.0007411236292682588, + "sampling/sampling_logp_difference/max": 7.207343101501465, + "sampling/sampling_logp_difference/mean": 0.020526543259620667, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.856050622947805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.856050622947805e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13689.0, + "completions/max_terminated_length": 13689.0, + "completions/mean_length": 4856.53125, + "completions/mean_terminated_length": 4856.53125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 1.0780886858701706, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0033157530706375837, + "learning_rate": 1e-05, + "loss": 0.046, + "num_tokens": 80405238.0, + "reward": 0.3359375, + "reward_std": 0.3487703502178192, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999889135360718, + "sampling/importance_sampling_ratio/min": 0.033773623406887054, + "sampling/sampling_logp_difference/max": 3.7256407737731934, + "sampling/sampling_logp_difference/mean": 0.019188418984413147, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.975351790406421e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.975351790406421e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16335.0, + "completions/max_terminated_length": 16335.0, + "completions/mean_length": 3930.5859375, + "completions/mean_terminated_length": 3930.5859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8666863515973091, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.005471619311720133, + "learning_rate": 1e-05, + "loss": -0.0779, + "num_tokens": 80926721.0, + "reward": 0.5859375, + "reward_std": 0.3164186179637909, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000040531158447, + "sampling/importance_sampling_ratio/min": 0.0002562212466727942, + "sampling/sampling_logp_difference/max": 8.269469261169434, + "sampling/sampling_logp_difference/mean": 0.017708823084831238, + "step": 109 + }, + { + "clip_ratio/high_max": 6.743997801095247e-06, + "clip_ratio/high_mean": 1.6859994502738118e-06, + "clip_ratio/low_mean": 3.61007656692891e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7786765119562915e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15546.0, + "completions/mean_length": 5934.9453125, + "completions/mean_terminated_length": 5684.16845703125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.9991667941212654, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002580739092081785, + "learning_rate": 1e-05, + "loss": -0.0065, + "num_tokens": 81707978.0, + "reward": 0.3046875, + "reward_std": 0.24671243131160736, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000852346420288, + "sampling/importance_sampling_ratio/min": 0.002478762762621045, + "sampling/sampling_logp_difference/max": 5.999995708465576, + "sampling/sampling_logp_difference/mean": 0.019801246002316475, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.43532002741631e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.43532002741631e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 5866.84375, + "completions/mean_terminated_length": 5699.9052734375, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.9848997294902802, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0010949905263260007, + "learning_rate": 1e-05, + "loss": 0.0266, + "num_tokens": 82477310.0, + "reward": 0.2734375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999667406082153, + "sampling/importance_sampling_ratio/min": 9.04304688447155e-05, + "sampling/sampling_logp_difference/max": 9.310929298400879, + "sampling/sampling_logp_difference/mean": 0.020769795402884483, + "step": 111 + }, + { + "clip_ratio/high_max": 1.9307613456476247e-05, + "clip_ratio/high_mean": 4.826903364119062e-06, + "clip_ratio/low_mean": 5.842190330440644e-05, + "clip_ratio/low_min": 1.2287753634154797e-05, + "clip_ratio/region_mean": 6.324880496322294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14501.0, + "completions/max_terminated_length": 14501.0, + "completions/mean_length": 6613.7578125, + "completions/mean_terminated_length": 6613.7578125, + "completions/min_length": 1033.0, + "completions/min_terminated_length": 1033.0, + "entropy": 0.9176012054085732, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020384234376251698, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 83345055.0, + "reward": 0.3671875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999457001686096, + "sampling/importance_sampling_ratio/min": 0.029541675001382828, + "sampling/sampling_logp_difference/max": 3.5219533443450928, + "sampling/sampling_logp_difference/mean": 0.018883168697357178, + "step": 112 + }, + { + "clip_ratio/high_max": 1.382043183184578e-05, + "clip_ratio/high_mean": 3.455107957961445e-06, + "clip_ratio/low_mean": 5.789885449303256e-05, + "clip_ratio/low_min": 1.017130716718384e-05, + "clip_ratio/region_mean": 6.135396188255982e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 6392.3125, + "completions/mean_terminated_length": 6070.0, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "entropy": 0.904954232275486, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0031166900880634785, + "learning_rate": 1e-05, + "loss": 0.0351, + "num_tokens": 84186343.0, + "reward": 0.390625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999208450317383, + "sampling/importance_sampling_ratio/min": 0.00022529886336997151, + "sampling/sampling_logp_difference/max": 8.398082733154297, + "sampling/sampling_logp_difference/mean": 0.01931958645582199, + "step": 113 + }, + { + "clip_ratio/high_max": 1.7221671441802755e-05, + "clip_ratio/high_mean": 6.549099907715572e-06, + "clip_ratio/low_mean": 3.147818074467068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802728065238625e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5982.703125, + "completions/mean_terminated_length": 5817.603515625, + "completions/min_length": 294.0, + 
"completions/min_terminated_length": 294.0, + "entropy": 0.8394555225968361, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022041688207536936, + "learning_rate": 1e-05, + "loss": 0.1043, + "num_tokens": 84971129.0, + "reward": 0.3125, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030828475952, + "sampling/importance_sampling_ratio/min": 1.553593506287143e-06, + "sampling/sampling_logp_difference/max": 13.374939918518066, + "sampling/sampling_logp_difference/mean": 0.01795877143740654, + "step": 114 + }, + { + "clip_ratio/high_max": 2.9651660042873118e-05, + "clip_ratio/high_mean": 9.398806923854863e-06, + "clip_ratio/low_mean": 4.788733849636628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.728614519284747e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14988.0, + "completions/mean_length": 4976.921875, + "completions/mean_terminated_length": 4608.95166015625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.8381234556436539, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0037972736172378063, + "learning_rate": 1e-05, + "loss": 0.1244, + "num_tokens": 85625559.0, + "reward": 0.4765625, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970555305481, + "sampling/importance_sampling_ratio/min": 0.002990707289427519, + "sampling/sampling_logp_difference/max": 5.8122453689575195, + "sampling/sampling_logp_difference/mean": 0.01815030723810196, + "step": 115 + }, + { + "clip_ratio/high_max": 
4.130592969886493e-06, + "clip_ratio/high_mean": 1.0326482424716232e-06, + "clip_ratio/low_mean": 1.6904315600640984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7936963843112608e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 6307.2421875, + "completions/mean_terminated_length": 6065.400390625, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 1.1176434755325317, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0012413962977007031, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 86453606.0, + "reward": 0.28125, + "reward_std": 0.2280253767967224, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 0.004730688873678446, + "sampling/sampling_logp_difference/max": 5.353684425354004, + "sampling/sampling_logp_difference/mean": 0.021790307015180588, + "step": 116 + }, + { + "clip_ratio/high_max": 1.3160772823539446e-05, + "clip_ratio/high_mean": 3.2901932058848615e-06, + "clip_ratio/low_mean": 3.582628983167524e-05, + "clip_ratio/low_min": 2.61966624748311e-06, + "clip_ratio/region_mean": 3.911648195753514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 7263.1640625, + "completions/mean_terminated_length": 7044.26416015625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.107876107096672, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017762042116373777, + "learning_rate": 1e-05, + "loss": 0.0349, + "num_tokens": 87402763.0, + "reward": 0.2578125, + "reward_std": 
0.27776598930358887, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741315841675, + "sampling/importance_sampling_ratio/min": 0.0009408573969267309, + "sampling/sampling_logp_difference/max": 6.968719005584717, + "sampling/sampling_logp_difference/mean": 0.02103034406900406, + "step": 117 + }, + { + "clip_ratio/high_max": 3.987745776612428e-05, + "clip_ratio/high_mean": 1.1877163728968299e-05, + "clip_ratio/low_mean": 4.26799579145154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.455712096136267e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15416.0, + "completions/mean_length": 5093.859375, + "completions/mean_terminated_length": 4914.65087890625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 1.1065888702869415, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032127038575708866, + "learning_rate": 1e-05, + "loss": 0.0194, + "num_tokens": 88077385.0, + "reward": 0.421875, + "reward_std": 0.345874547958374, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 7.033879228401929e-05, + "sampling/sampling_logp_difference/max": 9.562187194824219, + "sampling/sampling_logp_difference/mean": 0.020314980298280716, + "step": 118 + }, + { + "clip_ratio/high_max": 9.35208754526684e-06, + "clip_ratio/high_mean": 4.4788730519940145e-06, + "clip_ratio/low_mean": 3.470697703278347e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.918584917528278e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15740.0, + "completions/mean_length": 6943.53125, + "completions/mean_terminated_length": 6639.0, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.9009081721305847, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028925195802003145, + "learning_rate": 1e-05, + "loss": 0.0862, + "num_tokens": 88985269.0, + "reward": 0.3984375, + "reward_std": 0.3535328209400177, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 6.553035092338177e-08, + "sampling/sampling_logp_difference/max": 16.540752410888672, + "sampling/sampling_logp_difference/mean": 0.019378282129764557, + "step": 119 + }, + { + "clip_ratio/high_max": 1.0939961612166371e-05, + "clip_ratio/high_mean": 2.734990403041593e-06, + "clip_ratio/low_mean": 2.4615862798782473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7350853201824066e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15148.0, + "completions/max_terminated_length": 15148.0, + "completions/mean_length": 4976.25, + "completions/mean_terminated_length": 4976.25, + "completions/min_length": 702.0, + "completions/min_terminated_length": 702.0, + "entropy": 0.9463540017604828, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017386430408805609, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 89645205.0, + "reward": 0.359375, + "reward_std": 0.26462042331695557, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999554753303528, + "sampling/importance_sampling_ratio/min": 7.889595508459024e-06, + 
"sampling/sampling_logp_difference/max": 11.74996566772461, + "sampling/sampling_logp_difference/mean": 0.018035830929875374, + "step": 120 + }, + { + "clip_ratio/high_max": 5.941629297012696e-06, + "clip_ratio/high_mean": 1.485407324253174e-06, + "clip_ratio/low_mean": 2.6826061798601586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8311469009167922e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 6439.5390625, + "completions/mean_terminated_length": 6281.69091796875, + "completions/min_length": 959.0, + "completions/min_terminated_length": 959.0, + "entropy": 0.899876207113266, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037381781730800867, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 90489394.0, + "reward": 0.3203125, + "reward_std": 0.2624938488006592, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999206066131592, + "sampling/importance_sampling_ratio/min": 0.003606764366850257, + "sampling/sampling_logp_difference/max": 5.62494421005249, + "sampling/sampling_logp_difference/mean": 0.019368179142475128, + "step": 121 + }, + { + "clip_ratio/high_max": 5.189952389628161e-06, + "clip_ratio/high_mean": 1.2974880974070402e-06, + "clip_ratio/low_mean": 3.058137212974543e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.187886022715247e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15979.0, + "completions/mean_length": 6876.46875, + "completions/mean_terminated_length": 6408.884765625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.1018569767475128, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0018562980694696307, + "learning_rate": 1e-05, + "loss": 0.095, + "num_tokens": 91390054.0, + "reward": 0.21875, + "reward_std": 0.29955869913101196, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999849796295166, + "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05, + "sampling/sampling_logp_difference/max": 10.436432838439941, + "sampling/sampling_logp_difference/mean": 0.020825792104005814, + "step": 122 + }, + { + "clip_ratio/high_max": 2.022083435804234e-05, + "clip_ratio/high_mean": 5.055208589510585e-06, + "clip_ratio/low_mean": 3.029032552603894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.53455343429232e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14153.0, + "completions/mean_length": 6501.5078125, + "completions/mean_terminated_length": 6344.64306640625, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "entropy": 1.073579266667366, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016695430967956781, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 92241535.0, + "reward": 0.2734375, + "reward_std": 0.28641316294670105, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998984336853027, + "sampling/importance_sampling_ratio/min": 0.0002380236255703494, + "sampling/sampling_logp_difference/max": 8.343140602111816, + "sampling/sampling_logp_difference/mean": 0.020438479259610176, + "step": 123 + }, + { + "clip_ratio/high_max": 3.3911180707946187e-06, + "clip_ratio/high_mean": 8.477795176986547e-07, + "clip_ratio/low_mean": 2.2190370486896427e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.30381500614385e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14345.0, + "completions/max_terminated_length": 14345.0, + "completions/mean_length": 5474.1328125, + "completions/mean_terminated_length": 5474.1328125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0692576617002487, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034909825772047043, + "learning_rate": 1e-05, + "loss": 0.0, + "num_tokens": 92962472.0, + "reward": 0.3046875, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000006079673767, + "sampling/importance_sampling_ratio/min": 0.0017851731972768903, + "sampling/sampling_logp_difference/max": 6.328239917755127, + "sampling/sampling_logp_difference/mean": 0.019930578768253326, + "step": 124 + }, + { + "clip_ratio/high_max": 2.6292200345778838e-05, + "clip_ratio/high_mean": 7.620442374900449e-06, + "clip_ratio/low_mean": 4.615546390596137e-05, + "clip_ratio/low_min": 1.366510537081922e-05, + "clip_ratio/region_mean": 5.3775906508235494e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + "completions/mean_length": 7512.078125, + "completions/mean_terminated_length": 7225.88671875, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9676955863833427, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023449272848665714, + "learning_rate": 1e-05, + "loss": 0.0454, + "num_tokens": 93950506.0, + "reward": 0.3203125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999359250068665, + "sampling/importance_sampling_ratio/min": 0.0016406332142651081, + "sampling/sampling_logp_difference/max": 6.412672996520996, + "sampling/sampling_logp_difference/mean": 0.020141655579209328, + "step": 125 + }, + { + "clip_ratio/high_max": 5.097255780128762e-06, + "clip_ratio/high_mean": 1.2743139450321905e-06, + "clip_ratio/low_mean": 3.3802551342887455e-05, + "clip_ratio/low_min": 4.146762421441963e-06, + "clip_ratio/region_mean": 3.5076865287919645e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6920.484375, + "completions/mean_terminated_length": 6693.3603515625, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "entropy": 0.8662540689110756, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037103090435266495, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 94854016.0, + "reward": 0.4375, + "reward_std": 0.322716623544693, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00047686786274425685, + "sampling/sampling_logp_difference/max": 7.648271083831787, + "sampling/sampling_logp_difference/mean": 0.01915796287357807, + "step": 126 + }, + { + "clip_ratio/high_max": 8.4922439782531e-06, + "clip_ratio/high_mean": 2.123060994563275e-06, + "clip_ratio/low_mean": 5.024227584726759e-05, + "clip_ratio/low_min": 1.3627016414829995e-05, + "clip_ratio/region_mean": 5.236533706920454e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 7939.609375, + 
"completions/mean_terminated_length": 7805.57177734375, + "completions/min_length": 1260.0, + "completions/min_terminated_length": 1260.0, + "entropy": 0.9707008600234985, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024642283096909523, + "learning_rate": 1e-05, + "loss": 0.0788, + "num_tokens": 95889966.0, + "reward": 0.2265625, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998771548271179, + "sampling/importance_sampling_ratio/min": 4.540014560916461e-05, + "sampling/sampling_logp_difference/max": 9.999995231628418, + "sampling/sampling_logp_difference/mean": 0.020453302189707756, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.766829564710861e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.766829564710861e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14969.0, + "completions/mean_length": 5985.8203125, + "completions/mean_terminated_length": 5474.43408203125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.9083090648055077, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003317479742690921, + "learning_rate": 1e-05, + "loss": 0.0537, + "num_tokens": 96676847.0, + "reward": 0.3671875, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.000286750087980181, + "sampling/sampling_logp_difference/max": 8.156899452209473, + "sampling/sampling_logp_difference/mean": 
0.01996719278395176, + "step": 128 + }, + { + "clip_ratio/high_max": 1.8439853647578275e-05, + "clip_ratio/high_mean": 4.609963411894569e-06, + "clip_ratio/low_mean": 5.708034223061986e-05, + "clip_ratio/low_min": 2.75287948170444e-06, + "clip_ratio/region_mean": 6.169030598357494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15081.0, + "completions/mean_length": 6565.359375, + "completions/mean_terminated_length": 6488.04736328125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 1.1013468354940414, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019073591101914644, + "learning_rate": 1e-05, + "loss": 0.0622, + "num_tokens": 97539453.0, + "reward": 0.2734375, + "reward_std": 0.307217001914978, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999555945396423, + "sampling/importance_sampling_ratio/min": 0.0006022047018632293, + "sampling/sampling_logp_difference/max": 7.414913177490234, + "sampling/sampling_logp_difference/mean": 0.02150837704539299, + "step": 129 + }, + { + "clip_ratio/high_max": 9.068485269381199e-06, + "clip_ratio/high_mean": 2.2671213173452998e-06, + "clip_ratio/low_mean": 1.9822365402433206e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.208948649240483e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16099.0, + "completions/mean_length": 6779.6171875, + "completions/mean_terminated_length": 6703.9921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8940552547574043, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0010163087863475084, + "learning_rate": 1e-05, + "loss": 0.0249, + 
"num_tokens": 98429036.0, + "reward": 0.453125, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485015869141, + "sampling/importance_sampling_ratio/min": 3.464699460664633e-08, + "sampling/sampling_logp_difference/max": 17.178054809570312, + "sampling/sampling_logp_difference/mean": 0.018716152757406235, + "step": 130 + }, + { + "clip_ratio/high_max": 5.047242211730918e-06, + "clip_ratio/high_mean": 1.2618105529327295e-06, + "clip_ratio/low_mean": 2.9014110396019532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0275920835265424e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14549.0, + "completions/max_terminated_length": 14549.0, + "completions/mean_length": 5766.71875, + "completions/mean_terminated_length": 5766.71875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 1.0455922111868858, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002155766822397709, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 99184264.0, + "reward": 0.4140625, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999253749847412, + "sampling/importance_sampling_ratio/min": 0.00010798005678225309, + "sampling/sampling_logp_difference/max": 9.133563995361328, + "sampling/sampling_logp_difference/mean": 0.020948775112628937, + "step": 131 + }, + { + "clip_ratio/high_max": 2.0882574972347356e-05, + "clip_ratio/high_mean": 6.505383225885453e-06, + "clip_ratio/low_mean": 4.496008500609605e-05, + "clip_ratio/low_min": 7.757854064038838e-06, + "clip_ratio/region_mean": 5.1465468231981504e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14704.0, + "completions/mean_length": 6167.2421875, + "completions/mean_terminated_length": 6005.07177734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.9100174158811569, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0021464223973453045, + "learning_rate": 1e-05, + "loss": -0.0279, + "num_tokens": 99996831.0, + "reward": 0.421875, + "reward_std": 0.3916535973548889, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240040779114, + "sampling/importance_sampling_ratio/min": 0.02249590866267681, + "sampling/sampling_logp_difference/max": 3.794421911239624, + "sampling/sampling_logp_difference/mean": 0.01866895705461502, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0998018473837874e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0998018473837874e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15738.0, + "completions/mean_length": 6242.9453125, + "completions/mean_terminated_length": 6163.09423828125, + "completions/min_length": 1187.0, + "completions/min_terminated_length": 1187.0, + "entropy": 0.8624134212732315, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023277695290744305, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 100814112.0, + "reward": 0.3984375, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999959409236908, + 
"sampling/importance_sampling_ratio/min": 0.0002393616596236825, + "sampling/sampling_logp_difference/max": 8.33753490447998, + "sampling/sampling_logp_difference/mean": 0.0191188994795084, + "step": 133 + }, + { + "clip_ratio/high_max": 6.589872555196052e-06, + "clip_ratio/high_mean": 1.647468138799013e-06, + "clip_ratio/low_mean": 4.329304238126497e-05, + "clip_ratio/low_min": 3.5120251595799346e-06, + "clip_ratio/region_mean": 4.494051017900347e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14866.0, + "completions/mean_length": 5733.6875, + "completions/mean_terminated_length": 5478.080078125, + "completions/min_length": 789.0, + "completions/min_terminated_length": 789.0, + "entropy": 0.9628067463636398, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003547821193933487, + "learning_rate": 1e-05, + "loss": 0.0321, + "num_tokens": 101566264.0, + "reward": 0.3984375, + "reward_std": 0.36584997177124023, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0001282035664189607, + "sampling/sampling_logp_difference/max": 8.961891174316406, + "sampling/sampling_logp_difference/mean": 0.019646761938929558, + "step": 134 + }, + { + "clip_ratio/high_max": 1.7107527582993498e-05, + "clip_ratio/high_mean": 4.2768818957483745e-06, + "clip_ratio/low_mean": 3.014796902789385e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.442485103732906e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15848.0, + "completions/max_terminated_length": 15848.0, + "completions/mean_length": 5505.9375, + "completions/mean_terminated_length": 5505.9375, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "entropy": 
0.8041045889258385, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024891747161746025, + "learning_rate": 1e-05, + "loss": 0.1406, + "num_tokens": 102291456.0, + "reward": 0.5, + "reward_std": 0.35482609272003174, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999248385429382, + "sampling/importance_sampling_ratio/min": 0.0014627616619691253, + "sampling/sampling_logp_difference/max": 6.527429103851318, + "sampling/sampling_logp_difference/mean": 0.01716250739991665, + "step": 135 + }, + { + "clip_ratio/high_max": 1.548903105685895e-05, + "clip_ratio/high_mean": 3.872257764214737e-06, + "clip_ratio/low_mean": 5.380711581892683e-05, + "clip_ratio/low_min": 4.5777483137499075e-06, + "clip_ratio/region_mean": 5.767937363998499e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16005.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 5003.0625, + "completions/mean_terminated_length": 5003.0625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 0.9115714654326439, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00220683915540576, + "learning_rate": 1e-05, + "loss": 0.1361, + "num_tokens": 102949824.0, + "reward": 0.4140625, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 8.323705696966499e-05, + "sampling/sampling_logp_difference/max": 9.393817901611328, + "sampling/sampling_logp_difference/mean": 0.018076512962579727, + "step": 136 + }, + { + "clip_ratio/high_max": 2.181136096623959e-05, + "clip_ratio/high_mean": 
5.4528402415598975e-06, + "clip_ratio/low_mean": 3.4416837252138066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986967681157694e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15658.0, + "completions/max_terminated_length": 15658.0, + "completions/mean_length": 4742.1328125, + "completions/mean_terminated_length": 4742.1328125, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.9430246204137802, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003964806906878948, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 103580913.0, + "reward": 0.4609375, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 7.031940185697749e-05, + "sampling/sampling_logp_difference/max": 9.56246280670166, + "sampling/sampling_logp_difference/mean": 0.019651200622320175, + "step": 137 + }, + { + "clip_ratio/high_max": 4.07684046876966e-06, + "clip_ratio/high_mean": 1.019210117192415e-06, + "clip_ratio/low_mean": 3.8682398553646635e-05, + "clip_ratio/low_min": 8.189203072106466e-06, + "clip_ratio/region_mean": 3.970160832977854e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 6574.171875, + "completions/mean_terminated_length": 6091.72119140625, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.8429529070854187, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002067410387098789, + "learning_rate": 1e-05, + "loss": 0.0377, + "num_tokens": 104447463.0, + "reward": 0.3125, + "reward_std": 0.24511480331420898, + "rewards/accuracy_reward/mean": 0.3125, 
+ "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997583627700806, + "sampling/importance_sampling_ratio/min": 0.00021258489869069308, + "sampling/sampling_logp_difference/max": 8.456169128417969, + "sampling/sampling_logp_difference/mean": 0.018853647634387016, + "step": 138 + }, + { + "clip_ratio/high_max": 1.9725823221961036e-05, + "clip_ratio/high_mean": 4.931455805490259e-06, + "clip_ratio/low_mean": 5.9263072444082354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.419452870431996e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15518.0, + "completions/max_terminated_length": 15518.0, + "completions/mean_length": 4581.5625, + "completions/mean_terminated_length": 4581.5625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.7094272822141647, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004292502999305725, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 105052287.0, + "reward": 0.625, + "reward_std": 0.3908300995826721, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.0019342642044648528, + "sampling/sampling_logp_difference/max": 6.24802827835083, + "sampling/sampling_logp_difference/mean": 0.016310662031173706, + "step": 139 + }, + { + "clip_ratio/high_max": 1.0132298029930098e-05, + "clip_ratio/high_mean": 2.5330745074825245e-06, + "clip_ratio/low_mean": 4.6397121650443296e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.893019581686531e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16097.0, + "completions/mean_length": 7066.4453125, + 
"completions/mean_terminated_length": 6918.5478515625, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8481669947504997, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015785128343850374, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 105977048.0, + "reward": 0.3515625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.00104097044095397, + "sampling/sampling_logp_difference/max": 6.8676018714904785, + "sampling/sampling_logp_difference/mean": 0.018304405733942986, + "step": 140 + }, + { + "clip_ratio/high_max": 1.6989023606583942e-05, + "clip_ratio/high_mean": 4.2472559016459854e-06, + "clip_ratio/low_mean": 2.3075059743860038e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7322315418132348e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16104.0, + "completions/max_terminated_length": 16104.0, + "completions/mean_length": 6230.5234375, + "completions/mean_terminated_length": 6230.5234375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.9658062160015106, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002542720176279545, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 106793187.0, + "reward": 0.3203125, + "reward_std": 0.3050953149795532, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000169277191162, + "sampling/importance_sampling_ratio/min": 0.0002781494113150984, + "sampling/sampling_logp_difference/max": 8.187352180480957, + 
"sampling/sampling_logp_difference/mean": 0.019391046836972237, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7597974508353218e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7597974508353218e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14216.0, + "completions/mean_length": 5690.5546875, + "completions/mean_terminated_length": 5606.3544921875, + "completions/min_length": 1124.0, + "completions/min_terminated_length": 1124.0, + "entropy": 1.0098655670881271, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001451602904126048, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 107539874.0, + "reward": 0.4296875, + "reward_std": 0.23304283618927002, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999307990074158, + "sampling/importance_sampling_ratio/min": 5.640022671116185e-09, + "sampling/sampling_logp_difference/max": 18.993377685546875, + "sampling/sampling_logp_difference/mean": 0.018607191741466522, + "step": 142 + }, + { + "clip_ratio/high_max": 1.2800467629858758e-05, + "clip_ratio/high_mean": 4.19954119479371e-06, + "clip_ratio/low_mean": 2.350350996493944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.770305115973315e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15791.0, + "completions/max_terminated_length": 15791.0, + "completions/mean_length": 5471.1328125, + "completions/mean_terminated_length": 5471.1328125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0413162112236023, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023549250327050686, + "learning_rate": 1e-05, + "loss": 0.0411, + 
"num_tokens": 108260091.0, + "reward": 0.3203125, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999832510948181, + "sampling/importance_sampling_ratio/min": 0.0011709182290360332, + "sampling/sampling_logp_difference/max": 6.749967098236084, + "sampling/sampling_logp_difference/mean": 0.020427243784070015, + "step": 143 + }, + { + "clip_ratio/high_max": 2.1983064925734652e-05, + "clip_ratio/high_mean": 5.495766231433663e-06, + "clip_ratio/low_mean": 4.361141452591255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9107180757346214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 6211.7421875, + "completions/mean_terminated_length": 6050.2783203125, + "completions/min_length": 622.0, + "completions/min_terminated_length": 622.0, + "entropy": 0.9706784337759018, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017527056625112891, + "learning_rate": 1e-05, + "loss": 0.0686, + "num_tokens": 109073890.0, + "reward": 0.421875, + "reward_std": 0.29826050996780396, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999092221260071, + "sampling/importance_sampling_ratio/min": 0.002898645820096135, + "sampling/sampling_logp_difference/max": 5.843511581420898, + "sampling/sampling_logp_difference/mean": 0.018898162990808487, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.208964992358233e-05, + "clip_ratio/low_min": 3.9168990042526275e-06, + "clip_ratio/region_mean": 4.208964992358233e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 14880.0, + "completions/mean_length": 6007.8984375, + "completions/mean_terminated_length": 5926.19677734375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.1967609524726868, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0007858420140109956, + "learning_rate": 1e-05, + "loss": 0.011, + "num_tokens": 109861813.0, + "reward": 0.296875, + "reward_std": 0.23486506938934326, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 3.294382011631569e-08, + "sampling/sampling_logp_difference/max": 17.22846221923828, + "sampling/sampling_logp_difference/mean": 0.021845955401659012, + "step": 145 + }, + { + "clip_ratio/high_max": 4.5118208618077915e-06, + "clip_ratio/high_mean": 1.1279552154519479e-06, + "clip_ratio/low_mean": 3.749712686840212e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8625082197540905e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15838.0, + "completions/mean_length": 6800.9921875, + "completions/mean_terminated_length": 6725.53564453125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.0437887012958527, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029428249690681696, + "learning_rate": 1e-05, + "loss": 0.0405, + "num_tokens": 110756572.0, + "reward": 0.265625, + "reward_std": 0.3248382806777954, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999890327453613, + 
"sampling/importance_sampling_ratio/min": 0.0006329434108920395, + "sampling/sampling_logp_difference/max": 7.365129470825195, + "sampling/sampling_logp_difference/mean": 0.02010120078921318, + "step": 146 + }, + { + "clip_ratio/high_max": 1.427700522071973e-05, + "clip_ratio/high_mean": 3.5692513051799324e-06, + "clip_ratio/low_mean": 4.964020990883e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.320946092979284e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6309.4453125, + "completions/mean_terminated_length": 6230.1181640625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.9768906533718109, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002088683657348156, + "learning_rate": 1e-05, + "loss": 0.0316, + "num_tokens": 111585493.0, + "reward": 0.375, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000007152557373, + "sampling/importance_sampling_ratio/min": 0.009723234921693802, + "sampling/sampling_logp_difference/max": 4.633236885070801, + "sampling/sampling_logp_difference/mean": 0.020927833393216133, + "step": 147 + }, + { + "clip_ratio/high_max": 5.4841398196003865e-06, + "clip_ratio/high_mean": 1.3710349549000966e-06, + "clip_ratio/low_mean": 5.122006064084417e-05, + "clip_ratio/low_min": 3.785125954891555e-06, + "clip_ratio/region_mean": 5.25910957094311e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15209.0, + "completions/mean_length": 6221.859375, + "completions/mean_terminated_length": 6060.5556640625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 
0.9212924689054489, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002406956860795617, + "learning_rate": 1e-05, + "loss": 0.1051, + "num_tokens": 112400363.0, + "reward": 0.40625, + "reward_std": 0.31929677724838257, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 5.8308287407271564e-05, + "sampling/sampling_logp_difference/max": 9.74976634979248, + "sampling/sampling_logp_difference/mean": 0.018652018159627914, + "step": 148 + }, + { + "clip_ratio/high_max": 1.4568151755156578e-05, + "clip_ratio/high_mean": 3.6420379387891444e-06, + "clip_ratio/low_mean": 3.999794398623635e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3639981413434725e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14997.0, + "completions/mean_length": 6942.8203125, + "completions/mean_terminated_length": 6716.232421875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.949538916349411, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022962254006415606, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 113308748.0, + "reward": 0.375, + "reward_std": 0.3329663872718811, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999334812164307, + "sampling/importance_sampling_ratio/min": 0.00048810525913722813, + "sampling/sampling_logp_difference/max": 7.624979496002197, + "sampling/sampling_logp_difference/mean": 0.01939917355775833, + "step": 149 + }, + { + "clip_ratio/high_max": 8.786732450971613e-06, + "clip_ratio/high_mean": 
2.196683112742903e-06, + "clip_ratio/low_mean": 5.562954720517155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7826231113722315e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15182.0, + "completions/mean_length": 6783.1796875, + "completions/mean_terminated_length": 6552.76025390625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.9774708449840546, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020560629200190306, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 114196235.0, + "reward": 0.34375, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998990297317505, + "sampling/importance_sampling_ratio/min": 2.4757892447269114e-07, + "sampling/sampling_logp_difference/max": 15.211536407470703, + "sampling/sampling_logp_difference/mean": 0.019691556692123413, + "step": 150 + }, + { + "clip_ratio/high_max": 1.799483243303257e-05, + "clip_ratio/high_mean": 4.498708108258143e-06, + "clip_ratio/low_mean": 2.6389980291696702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0888688343111426e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15549.0, + "completions/mean_length": 5568.15625, + "completions/mean_terminated_length": 5396.4765625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "entropy": 0.9303529411554337, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022214846685528755, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 114928047.0, + "reward": 0.234375, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.234375, + 
"rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999408721923828, + "sampling/importance_sampling_ratio/min": 2.1446083337650634e-05, + "sampling/sampling_logp_difference/max": 10.749968528747559, + "sampling/sampling_logp_difference/mean": 0.01938418298959732, + "step": 151 + }, + { + "clip_ratio/high_max": 1.1957493370573502e-05, + "clip_ratio/high_mean": 2.9893733426433755e-06, + "clip_ratio/low_mean": 5.885063319510664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.184000585562899e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15340.0, + "completions/max_terminated_length": 15340.0, + "completions/mean_length": 6086.578125, + "completions/mean_terminated_length": 6086.578125, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 0.9131873697042465, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002448044717311859, + "learning_rate": 1e-05, + "loss": 0.0599, + "num_tokens": 115725657.0, + "reward": 0.40625, + "reward_std": 0.35878273844718933, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999779462814331, + "sampling/importance_sampling_ratio/min": 0.02929726243019104, + "sampling/sampling_logp_difference/max": 3.530261278152466, + "sampling/sampling_logp_difference/mean": 0.019298439845442772, + "step": 152 + }, + { + "clip_ratio/high_max": 1.3385357760853367e-05, + "clip_ratio/high_mean": 3.3463394402133417e-06, + "clip_ratio/low_mean": 5.717015119444113e-05, + "clip_ratio/low_min": 3.4328400033700746e-06, + "clip_ratio/region_mean": 6.0516490520967636e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + 
"completions/mean_length": 6442.5390625, + "completions/mean_terminated_length": 6203.9443359375, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.8959419652819633, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002013204852119088, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 116571478.0, + "reward": 0.2734375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000044584274292, + "sampling/importance_sampling_ratio/min": 1.0374163821325055e-06, + "sampling/sampling_logp_difference/max": 13.778777122497559, + "sampling/sampling_logp_difference/mean": 0.01925014518201351, + "step": 153 + }, + { + "clip_ratio/high_max": 9.34224021875707e-06, + "clip_ratio/high_mean": 3.136903728773177e-06, + "clip_ratio/low_mean": 2.9738095065567904e-05, + "clip_ratio/low_min": 3.7240065466903616e-06, + "clip_ratio/region_mean": 3.2874999135401595e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15946.0, + "completions/mean_length": 6633.5703125, + "completions/mean_terminated_length": 6319.0400390625, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.0223619118332863, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024523327592760324, + "learning_rate": 1e-05, + "loss": 0.056, + "num_tokens": 117440743.0, + "reward": 0.3203125, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 3.0026931199245155e-05, + 
"sampling/sampling_logp_difference/max": 10.413415908813477, + "sampling/sampling_logp_difference/mean": 0.02061290666460991, + "step": 154 + }, + { + "clip_ratio/high_max": 1.4537483366439119e-05, + "clip_ratio/high_mean": 3.6343708416097797e-06, + "clip_ratio/low_mean": 3.954866042477079e-05, + "clip_ratio/low_min": 9.874949228105834e-06, + "clip_ratio/region_mean": 4.318303126638057e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15919.0, + "completions/mean_length": 7183.0, + "completions/mean_terminated_length": 6886.193359375, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.9815369099378586, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018688985146582127, + "learning_rate": 1e-05, + "loss": 0.0395, + "num_tokens": 118380687.0, + "reward": 0.2890625, + "reward_std": 0.2498900145292282, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999039173126221, + "sampling/importance_sampling_ratio/min": 1.3847662557964213e-05, + "sampling/sampling_logp_difference/max": 11.187394142150879, + "sampling/sampling_logp_difference/mean": 0.019792160019278526, + "step": 155 + }, + { + "clip_ratio/high_max": 7.165636361605721e-06, + "clip_ratio/high_mean": 1.7914090904014301e-06, + "clip_ratio/low_mean": 4.9011068711024564e-05, + "clip_ratio/low_min": 1.0991705721608014e-05, + "clip_ratio/region_mean": 5.0802477687739156e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 6324.640625, + "completions/mean_terminated_length": 5829.91748046875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.852975606918335, + "epoch": 
0.14351425942962281, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002005894435569644, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 119207089.0, + "reward": 0.3984375, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000035762786865, + "sampling/importance_sampling_ratio/min": 5.788659223071591e-07, + "sampling/sampling_logp_difference/max": 14.362195014953613, + "sampling/sampling_logp_difference/mean": 0.01853565312922001, + "step": 156 + }, + { + "clip_ratio/high_max": 7.795394822096569e-06, + "clip_ratio/high_mean": 1.948848705524142e-06, + "clip_ratio/low_mean": 3.834237736555224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0291225786859286e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 5723.421875, + "completions/mean_terminated_length": 5290.06494140625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.8744911625981331, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002577397273853421, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 119961895.0, + "reward": 0.390625, + "reward_std": 0.34321609139442444, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999703764915466, + "sampling/importance_sampling_ratio/min": 0.07882421463727951, + "sampling/sampling_logp_difference/max": 2.5405349731445312, + "sampling/sampling_logp_difference/mean": 0.018341556191444397, + "step": 157 + }, + { + "clip_ratio/high_max": 9.214097190124448e-06, + "clip_ratio/high_mean": 2.303524297531112e-06, + 
"clip_ratio/low_mean": 2.636873176697918e-05, + "clip_ratio/low_min": 2.9339967113628518e-06, + "clip_ratio/region_mean": 2.8672255837136618e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16055.0, + "completions/mean_length": 7886.015625, + "completions/mean_terminated_length": 7682.064453125, + "completions/min_length": 989.0, + "completions/min_terminated_length": 989.0, + "entropy": 0.9391767829656601, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002552987542003393, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 120990289.0, + "reward": 0.328125, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 0.000899312668479979, + "sampling/sampling_logp_difference/max": 7.013879776000977, + "sampling/sampling_logp_difference/mean": 0.02049873024225235, + "step": 158 + }, + { + "clip_ratio/high_max": 3.406416203688423e-05, + "clip_ratio/high_mean": 9.72330332160709e-06, + "clip_ratio/low_mean": 3.168332909808669e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140663151019908e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 6173.1640625, + "completions/mean_terminated_length": 6011.087890625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.9148785546422005, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002678362652659416, + "learning_rate": 1e-05, + "loss": 0.039, + "num_tokens": 121797958.0, + "reward": 0.4140625, + "reward_std": 0.3608373999595642, + "rewards/accuracy_reward/mean": 0.4140625, + 
"rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999265074729919, + "sampling/importance_sampling_ratio/min": 0.002013920107856393, + "sampling/sampling_logp_difference/max": 6.207672119140625, + "sampling/sampling_logp_difference/mean": 0.018977735191583633, + "step": 159 + }, + { + "clip_ratio/high_max": 1.8476588593330234e-05, + "clip_ratio/high_mean": 4.6191471483325586e-06, + "clip_ratio/low_mean": 4.459614581264759e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9215293188353826e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 6594.21875, + "completions/mean_terminated_length": 6196.259765625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.9486038386821747, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033711253199726343, + "learning_rate": 1e-05, + "loss": 0.026, + "num_tokens": 122661170.0, + "reward": 0.3828125, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998981356620789, + "sampling/importance_sampling_ratio/min": 0.0002968419576063752, + "sampling/sampling_logp_difference/max": 8.122310638427734, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 160 + }, + { + "clip_ratio/high_max": 7.97335997049231e-06, + "clip_ratio/high_mean": 2.7343705824023345e-06, + "clip_ratio/low_mean": 5.420079878604156e-05, + "clip_ratio/low_min": 4.594068286678521e-06, + "clip_ratio/region_mean": 5.693517005056492e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15928.0, + 
"completions/mean_length": 6533.9453125, + "completions/mean_terminated_length": 6377.595703125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9986584335565567, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017857529455795884, + "learning_rate": 1e-05, + "loss": 0.0804, + "num_tokens": 123518107.0, + "reward": 0.34375, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998549818992615, + "sampling/importance_sampling_ratio/min": 9.012701411847956e-06, + "sampling/sampling_logp_difference/max": 11.616875648498535, + "sampling/sampling_logp_difference/mean": 0.02010391652584076, + "step": 161 + }, + { + "clip_ratio/high_max": 4.470512521947967e-06, + "clip_ratio/high_mean": 1.1176281304869917e-06, + "clip_ratio/low_mean": 3.5141094485879876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.625872295742738e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13212.0, + "completions/mean_length": 5742.21875, + "completions/mean_terminated_length": 5658.42529296875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0379670709371567, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018227624241262674, + "learning_rate": 1e-05, + "loss": -0.0237, + "num_tokens": 124279031.0, + "reward": 0.21875, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998506903648376, + "sampling/importance_sampling_ratio/min": 0.0020977305248379707, + "sampling/sampling_logp_difference/max": 
6.16689920425415, + "sampling/sampling_logp_difference/mean": 0.019987668842077255, + "step": 162 + }, + { + "clip_ratio/high_max": 1.0003542683989508e-05, + "clip_ratio/high_mean": 3.21091931709816e-06, + "clip_ratio/low_mean": 5.731009014198207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.0521009800140746e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7584.703125, + "completions/mean_terminated_length": 7515.41748046875, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "entropy": 0.953459307551384, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002219022251665592, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 125270761.0, + "reward": 0.359375, + "reward_std": 0.37033066153526306, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999880790710449, + "sampling/importance_sampling_ratio/min": 0.0024849213659763336, + "sampling/sampling_logp_difference/max": 5.997514247894287, + "sampling/sampling_logp_difference/mean": 0.020291510969400406, + "step": 163 + }, + { + "clip_ratio/high_max": 7.734669452474918e-06, + "clip_ratio/high_mean": 1.9336673631187296e-06, + "clip_ratio/low_mean": 3.1135301298945706e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3068968605221016e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 4714.671875, + "completions/mean_terminated_length": 4622.78759765625, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 1.018719919025898, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.0014189074281603098, + "learning_rate": 1e-05, + "loss": 0.0501, + "num_tokens": 125895279.0, + "reward": 0.3984375, + "reward_std": 0.28383445739746094, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479651451111, + "sampling/importance_sampling_ratio/min": 4.017410901724361e-07, + "sampling/sampling_logp_difference/max": 14.727458000183105, + "sampling/sampling_logp_difference/mean": 0.018739396706223488, + "step": 164 + }, + { + "clip_ratio/high_max": 1.0069575182569679e-05, + "clip_ratio/high_mean": 2.5173937956424197e-06, + "clip_ratio/low_mean": 3.824179225375701e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0759185367278405e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15913.0, + "completions/mean_length": 6316.140625, + "completions/mean_terminated_length": 6074.51220703125, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.9325072392821312, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001702460227534175, + "learning_rate": 1e-05, + "loss": 0.1007, + "num_tokens": 126722881.0, + "reward": 0.4609375, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999539852142334, + "sampling/importance_sampling_ratio/min": 0.0012551364488899708, + "sampling/sampling_logp_difference/max": 6.680510997772217, + "sampling/sampling_logp_difference/mean": 0.01929408684372902, + "step": 165 + }, + { + "clip_ratio/high_max": 6.873041002108948e-06, + "clip_ratio/high_mean": 1.718260250527237e-06, + "clip_ratio/low_mean": 3.119859468370123e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.291685527528898e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15832.0, + "completions/mean_length": 4687.140625, + "completions/mean_terminated_length": 4595.03955078125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 1.0886607319116592, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032931750174611807, + "learning_rate": 1e-05, + "loss": 0.0078, + "num_tokens": 127341715.0, + "reward": 0.28125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821186065674, + "sampling/importance_sampling_ratio/min": 0.0019364450126886368, + "sampling/sampling_logp_difference/max": 6.246901512145996, + "sampling/sampling_logp_difference/mean": 0.020621225237846375, + "step": 166 + }, + { + "clip_ratio/high_max": 1.773085250533768e-05, + "clip_ratio/high_mean": 4.43271312633442e-06, + "clip_ratio/low_mean": 4.30743207289197e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7507033741567284e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14125.0, + "completions/mean_length": 5705.515625, + "completions/mean_terminated_length": 5449.232421875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0523068830370903, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0031696646474301815, + "learning_rate": 1e-05, + "loss": -0.0414, + "num_tokens": 128093597.0, + "reward": 0.1953125, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999619126319885, + "sampling/importance_sampling_ratio/min": 3.197810656274669e-05, + "sampling/sampling_logp_difference/max": 10.350459098815918, + "sampling/sampling_logp_difference/mean": 0.021961934864521027, + "step": 167 + }, + { + "clip_ratio/high_max": 1.885905066956184e-05, + "clip_ratio/high_mean": 4.71476266739046e-06, + "clip_ratio/low_mean": 5.0530389898995054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.524515336219338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15958.0, + "completions/mean_length": 6214.4921875, + "completions/mean_terminated_length": 6053.07177734375, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.9371421113610268, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023704832419753075, + "learning_rate": 1e-05, + "loss": 0.075, + "num_tokens": 128906948.0, + "reward": 0.40625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.0003354824730195105, + "sampling/sampling_logp_difference/max": 7.999940872192383, + "sampling/sampling_logp_difference/mean": 0.01882763020694256, + "step": 168 + }, + { + "clip_ratio/high_max": 3.042072216885572e-05, + "clip_ratio/high_mean": 7.60518054221393e-06, + "clip_ratio/low_mean": 4.5897569179942366e-05, + "clip_ratio/low_min": 8.727477506909054e-06, + "clip_ratio/region_mean": 5.3502750233747065e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 7127.0703125, + "completions/mean_terminated_length": 7054.18115234375, + 
"completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.9854387491941452, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003370177699252963, + "learning_rate": 1e-05, + "loss": 0.1197, + "num_tokens": 129839813.0, + "reward": 0.359375, + "reward_std": 0.3329663574695587, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999907910823822, + "sampling/importance_sampling_ratio/min": 1.077816432371037e-05, + "sampling/sampling_logp_difference/max": 11.43798828125, + "sampling/sampling_logp_difference/mean": 0.019736800342798233, + "step": 169 + }, + { + "clip_ratio/high_max": 2.1401074718596647e-05, + "clip_ratio/high_mean": 6.243764005375851e-06, + "clip_ratio/low_mean": 3.2797592325550795e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.904135610355297e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 6566.2890625, + "completions/mean_terminated_length": 6330.6640625, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.7978609576821327, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026055986527353525, + "learning_rate": 1e-05, + "loss": 0.0661, + "num_tokens": 130698370.0, + "reward": 0.5, + "reward_std": 0.36295419931411743, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999133944511414, + "sampling/importance_sampling_ratio/min": 0.00031152591691352427, + "sampling/sampling_logp_difference/max": 8.074028015136719, + "sampling/sampling_logp_difference/mean": 0.01787097379565239, + "step": 170 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0564424403346493e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0564424403346493e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15576.0, + "completions/max_terminated_length": 15576.0, + "completions/mean_length": 7186.2890625, + "completions/mean_terminated_length": 7186.2890625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 1.0232757329940796, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0023866184055805206, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 131637439.0, + "reward": 0.2734375, + "reward_std": 0.2059282809495926, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999207258224487, + "sampling/importance_sampling_ratio/min": 0.0007378471200354397, + "sampling/sampling_logp_difference/max": 7.211773872375488, + "sampling/sampling_logp_difference/mean": 0.02137116715312004, + "step": 171 + }, + { + "clip_ratio/high_max": 4.037900725961663e-05, + "clip_ratio/high_mean": 1.0094751814904157e-05, + "clip_ratio/low_mean": 5.8380828136250784e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.847557995115494e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13638.0, + "completions/mean_length": 5591.5703125, + "completions/mean_terminated_length": 5420.26220703125, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9335208311676979, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003491115989163518, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 132371816.0, + "reward": 0.5, + "reward_std": 0.3406373858451843, + 
"rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999891459941864, + "sampling/importance_sampling_ratio/min": 0.00012356207298580557, + "sampling/sampling_logp_difference/max": 8.998766899108887, + "sampling/sampling_logp_difference/mean": 0.018760837614536285, + "step": 172 + }, + { + "clip_ratio/high_max": 2.8378776733006816e-06, + "clip_ratio/high_mean": 7.094694183251704e-07, + "clip_ratio/low_mean": 4.4085751369493664e-05, + "clip_ratio/low_min": 6.7955093072669115e-06, + "clip_ratio/region_mean": 4.4795220674132e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16302.0, + "completions/mean_length": 7152.3828125, + "completions/mean_terminated_length": 6930.82421875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.1329835206270218, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002830669516697526, + "learning_rate": 1e-05, + "loss": 0.0526, + "num_tokens": 133307297.0, + "reward": 0.28125, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999501705169678, + "sampling/importance_sampling_ratio/min": 0.00028047082014381886, + "sampling/sampling_logp_difference/max": 8.179040908813477, + "sampling/sampling_logp_difference/mean": 0.021548541262745857, + "step": 173 + }, + { + "clip_ratio/high_max": 1.0150829439226072e-05, + "clip_ratio/high_mean": 2.537707359806518e-06, + "clip_ratio/low_mean": 3.4009618616437365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.654732597624388e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15068.0, + "completions/mean_length": 7263.453125, + "completions/mean_terminated_length": 7118.68310546875, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.092760555446148, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0027821618132293224, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 134260107.0, + "reward": 0.3203125, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999946117401123, + "sampling/importance_sampling_ratio/min": 7.832317351130769e-05, + "sampling/sampling_logp_difference/max": 9.454667091369629, + "sampling/sampling_logp_difference/mean": 0.022098438814282417, + "step": 174 + }, + { + "clip_ratio/high_max": 1.0561876024439698e-05, + "clip_ratio/high_mean": 2.6404690061099245e-06, + "clip_ratio/low_mean": 1.6864279416495265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9504748649978865e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15388.0, + "completions/mean_length": 7088.8125, + "completions/mean_terminated_length": 6710.958984375, + "completions/min_length": 1314.0, + "completions/min_terminated_length": 1314.0, + "entropy": 1.0669445469975471, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0007076738984324038, + "learning_rate": 1e-05, + "loss": -0.0197, + "num_tokens": 135186139.0, + "reward": 0.328125, + "reward_std": 0.20593319833278656, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998199343681335, + "sampling/importance_sampling_ratio/min": 
3.084653872065246e-05, + "sampling/sampling_logp_difference/max": 10.386486053466797, + "sampling/sampling_logp_difference/mean": 0.020075790584087372, + "step": 175 + }, + { + "clip_ratio/high_max": 7.095016371749807e-06, + "clip_ratio/high_mean": 1.7737540929374518e-06, + "clip_ratio/low_mean": 2.7592465016823553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.936621888238733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15626.0, + "completions/max_terminated_length": 15626.0, + "completions/mean_length": 5352.734375, + "completions/mean_terminated_length": 5352.734375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 1.0387161895632744, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0022445612121373415, + "learning_rate": 1e-05, + "loss": 0.0261, + "num_tokens": 135888929.0, + "reward": 0.4765625, + "reward_std": 0.399257630109787, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054670333862, + "sampling/importance_sampling_ratio/min": 0.00032565294532105327, + "sampling/sampling_logp_difference/max": 8.029678344726562, + "sampling/sampling_logp_difference/mean": 0.02010166086256504, + "step": 176 + }, + { + "clip_ratio/high_max": 1.5100852124305675e-05, + "clip_ratio/high_mean": 4.426987970873597e-06, + "clip_ratio/low_mean": 2.7625993425317574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2052981168817496e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16266.0, + "completions/mean_length": 7758.90625, + "completions/mean_terminated_length": 7408.29248046875, + "completions/min_length": 742.0, + "completions/min_terminated_length": 742.0, + "entropy": 1.0648984238505363, + "epoch": 0.16283348666053357, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0022021254990249872, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 136901941.0, + "reward": 0.3671875, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858140945435, + "sampling/importance_sampling_ratio/min": 2.2461865967216e-07, + "sampling/sampling_logp_difference/max": 15.30886173248291, + "sampling/sampling_logp_difference/mean": 0.021426808089017868, + "step": 177 + }, + { + "clip_ratio/high_max": 2.5346608254039893e-05, + "clip_ratio/high_mean": 7.4063813144675805e-06, + "clip_ratio/low_mean": 2.2069365058996482e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9475746259777225e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 7036.953125, + "completions/mean_terminated_length": 6496.21484375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9684997871518135, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013461806811392307, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 137824623.0, + "reward": 0.34375, + "reward_std": 0.2546031177043915, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999944806098938, + "sampling/importance_sampling_ratio/min": 5.834372132085264e-05, + "sampling/sampling_logp_difference/max": 9.74915885925293, + "sampling/sampling_logp_difference/mean": 0.020304443314671516, + "step": 178 + }, + { + "clip_ratio/high_max": 1.3147734080121154e-05, + "clip_ratio/high_mean": 3.2869335200302885e-06, + "clip_ratio/low_mean": 4.841489999307669e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.170183294467279e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15500.0, + "completions/mean_length": 6114.1875, + "completions/mean_terminated_length": 5951.1748046875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.943072073161602, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132438588887453, + "learning_rate": 1e-05, + "loss": 0.0943, + "num_tokens": 138625247.0, + "reward": 0.40625, + "reward_std": 0.321650892496109, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999298453330994, + "sampling/importance_sampling_ratio/min": 0.0017275095451623201, + "sampling/sampling_logp_difference/max": 6.361074447631836, + "sampling/sampling_logp_difference/mean": 0.020084267482161522, + "step": 179 + }, + { + "clip_ratio/high_max": 1.7873157958092634e-05, + "clip_ratio/high_mean": 4.468289489523158e-06, + "clip_ratio/low_mean": 3.5252990301160025e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9721279790683184e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15050.0, + "completions/mean_length": 7618.875, + "completions/mean_terminated_length": 7034.53369140625, + "completions/min_length": 1030.0, + "completions/min_terminated_length": 1030.0, + "entropy": 0.9142575263977051, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026741649489849806, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 139619287.0, + "reward": 0.2890625, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 0.005949751473963261, + "sampling/sampling_logp_difference/max": 5.124405860900879, + "sampling/sampling_logp_difference/mean": 0.020061582326889038, + "step": 180 + }, + { + "clip_ratio/high_max": 1.0512151675357018e-05, + "clip_ratio/high_mean": 2.6280379188392544e-06, + "clip_ratio/low_mean": 4.5301517502593924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.792955542143318e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16106.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 5333.875, + "completions/mean_terminated_length": 5333.875, + "completions/min_length": 1109.0, + "completions/min_terminated_length": 1109.0, + "entropy": 0.8107482865452766, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027016003150492907, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 140318935.0, + "reward": 0.5703125, + "reward_std": 0.2556639611721039, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + "sampling/importance_sampling_ratio/min": 0.006856904830783606, + "sampling/sampling_logp_difference/max": 4.982499122619629, + "sampling/sampling_logp_difference/mean": 0.017069874331355095, + "step": 181 + }, + { + "clip_ratio/high_max": 1.85085939392593e-05, + "clip_ratio/high_mean": 5.24943533264377e-06, + "clip_ratio/low_mean": 5.6120721524166584e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.137015702734061e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16050.0, + "completions/mean_length": 7443.3046875, + "completions/mean_terminated_length": 7154.89501953125, + 
"completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 0.9224414080381393, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002655779244378209, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 141293534.0, + "reward": 0.234375, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999659061431885, + "sampling/importance_sampling_ratio/min": 0.00042018835665658116, + "sampling/sampling_logp_difference/max": 7.774807453155518, + "sampling/sampling_logp_difference/mean": 0.02006504125893116, + "step": 182 + }, + { + "clip_ratio/high_max": 1.494229445597739e-05, + "clip_ratio/high_mean": 3.7355736139943474e-06, + "clip_ratio/low_mean": 2.2748562741981004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6484136355975352e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15923.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 5646.6875, + "completions/mean_terminated_length": 5646.6875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8945339694619179, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016281780553981662, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 142037438.0, + "reward": 0.46875, + "reward_std": 0.17912296950817108, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030517578125, + "sampling/importance_sampling_ratio/min": 0.0005717006279155612, + "sampling/sampling_logp_difference/max": 7.46689510345459, + "sampling/sampling_logp_difference/mean": 0.019336247816681862, + "step": 183 + }, + { + 
"clip_ratio/high_max": 3.335990868436056e-05, + "clip_ratio/high_mean": 8.33997717109014e-06, + "clip_ratio/low_mean": 3.5050728683927446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339070608239126e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14142.0, + "completions/mean_length": 6384.640625, + "completions/mean_terminated_length": 5892.86865234375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.840093269944191, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002166559686884284, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 142873848.0, + "reward": 0.4765625, + "reward_std": 0.35506346821784973, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 4.785555574926548e-06, + "sampling/sampling_logp_difference/max": 12.249908447265625, + "sampling/sampling_logp_difference/mean": 0.018109092488884926, + "step": 184 + }, + { + "clip_ratio/high_max": 1.541105484648142e-05, + "clip_ratio/high_mean": 3.852763711620355e-06, + "clip_ratio/low_mean": 4.0552770769863855e-05, + "clip_ratio/low_min": 7.133888630050933e-06, + "clip_ratio/region_mean": 4.440553459517105e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14828.0, + "completions/mean_length": 5775.0, + "completions/mean_terminated_length": 5691.46435546875, + "completions/min_length": 1147.0, + "completions/min_terminated_length": 1147.0, + "entropy": 0.8915362879633904, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021932912059128284, + "learning_rate": 1e-05, + "loss": -0.0086, + "num_tokens": 143636152.0, + "reward": 
0.4375, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000008225440979, + "sampling/importance_sampling_ratio/min": 9.714113069492214e-09, + "sampling/sampling_logp_difference/max": 18.44968605041504, + "sampling/sampling_logp_difference/mean": 0.019278086721897125, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7509142171311396e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7509142171311396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6181.640625, + "completions/mean_terminated_length": 6019.69873046875, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "entropy": 1.0544511675834656, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022947140969336033, + "learning_rate": 1e-05, + "loss": 0.0242, + "num_tokens": 144447370.0, + "reward": 0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999147653579712, + "sampling/importance_sampling_ratio/min": 7.419757253046555e-08, + "sampling/sampling_logp_difference/max": 16.416534423828125, + "sampling/sampling_logp_difference/mean": 0.02050788700580597, + "step": 186 + }, + { + "clip_ratio/high_max": 1.5700999938417226e-05, + "clip_ratio/high_mean": 3.9252499846043065e-06, + "clip_ratio/low_mean": 2.4595847037289786e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8521096965050674e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15824.0, + "completions/mean_length": 6542.3046875, + "completions/mean_terminated_length": 6306.1044921875, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.933225467801094, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034910975955426693, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 145303505.0, + "reward": 0.390625, + "reward_std": 0.30433881282806396, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999945163726807, + "sampling/importance_sampling_ratio/min": 0.007213745731860399, + "sampling/sampling_logp_difference/max": 4.931766986846924, + "sampling/sampling_logp_difference/mean": 0.020022759214043617, + "step": 187 + }, + { + "clip_ratio/high_max": 6.0999414017715026e-06, + "clip_ratio/high_mean": 1.5249853504428756e-06, + "clip_ratio/low_mean": 2.61421698724007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7667155109156738e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 5889.4765625, + "completions/mean_terminated_length": 5637.6083984375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.9649673849344254, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024078311398625374, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 146082198.0, + "reward": 0.3359375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999341368675232, + "sampling/importance_sampling_ratio/min": 
0.0008680344326421618, + "sampling/sampling_logp_difference/max": 7.04927921295166, + "sampling/sampling_logp_difference/mean": 0.02060198038816452, + "step": 188 + }, + { + "clip_ratio/high_max": 7.789618393871933e-06, + "clip_ratio/high_mean": 1.9474045984679833e-06, + "clip_ratio/low_mean": 3.6395756637830345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.834316100892465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16233.0, + "completions/mean_length": 5349.2421875, + "completions/mean_terminated_length": 5084.408203125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8402756005525589, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021191861014813185, + "learning_rate": 1e-05, + "loss": 0.1275, + "num_tokens": 146786245.0, + "reward": 0.4765625, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999837875366211, + "sampling/importance_sampling_ratio/min": 3.763807762879878e-05, + "sampling/sampling_logp_difference/max": 10.187494277954102, + "sampling/sampling_logp_difference/mean": 0.017112664878368378, + "step": 189 + }, + { + "clip_ratio/high_max": 1.2461773394534248e-05, + "clip_ratio/high_mean": 3.115443348633562e-06, + "clip_ratio/low_mean": 5.095924211673264e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4074685294835945e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 7272.3203125, + "completions/mean_terminated_length": 7053.64013671875, + "completions/min_length": 1074.0, + "completions/min_terminated_length": 1074.0, + "entropy": 0.9627499282360077, + "epoch": 
0.17479300827966882, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022120666690170765, + "learning_rate": 1e-05, + "loss": 0.0079, + "num_tokens": 147737086.0, + "reward": 0.2890625, + "reward_std": 0.27304792404174805, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999538660049438, + "sampling/importance_sampling_ratio/min": 1.6960719221970066e-05, + "sampling/sampling_logp_difference/max": 10.984610557556152, + "sampling/sampling_logp_difference/mean": 0.0203307643532753, + "step": 190 + }, + { + "clip_ratio/high_max": 1.7891727566166082e-05, + "clip_ratio/high_mean": 4.472931891541521e-06, + "clip_ratio/low_mean": 5.616715043288423e-05, + "clip_ratio/low_min": 7.80031223257538e-06, + "clip_ratio/region_mean": 6.064008221073891e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 6387.1875, + "completions/mean_terminated_length": 5895.54052734375, + "completions/min_length": 1310.0, + "completions/min_terminated_length": 1310.0, + "entropy": 0.9110158830881119, + "epoch": 0.17571297148114076, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030851473566144705, + "learning_rate": 1e-05, + "loss": 0.1091, + "num_tokens": 148573782.0, + "reward": 0.40625, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 0.003961040172725916, + "sampling/sampling_logp_difference/max": 5.531248569488525, + "sampling/sampling_logp_difference/mean": 0.018049638718366623, + "step": 191 + }, + { + "clip_ratio/high_max": 1.6994396901282016e-05, + "clip_ratio/high_mean": 5.400205964178895e-06, + 
"clip_ratio/low_mean": 3.274822392995702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8148429439388565e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 7267.59375, + "completions/mean_terminated_length": 7195.81103515625, + "completions/min_length": 653.0, + "completions/min_terminated_length": 653.0, + "entropy": 0.9254888147115707, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020694085396826267, + "learning_rate": 1e-05, + "loss": 0.0462, + "num_tokens": 149521258.0, + "reward": 0.2734375, + "reward_std": 0.29719972610473633, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054670333862, + "sampling/importance_sampling_ratio/min": 7.411616934405174e-06, + "sampling/sampling_logp_difference/max": 11.812461853027344, + "sampling/sampling_logp_difference/mean": 0.01898832805454731, + "step": 192 + }, + { + "clip_ratio/high_max": 4.10414668294834e-06, + "clip_ratio/high_mean": 1.026036670737085e-06, + "clip_ratio/low_mean": 4.7441100377909606e-05, + "clip_ratio/low_min": 4.552241534838686e-06, + "clip_ratio/region_mean": 4.8467136821273016e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16076.0, + "completions/mean_length": 7100.1953125, + "completions/mean_terminated_length": 6952.83349609375, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "entropy": 0.8455610796809196, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003085972974076867, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 150447923.0, + "reward": 0.25, + "reward_std": 0.23645778000354767, + "rewards/accuracy_reward/mean": 0.25, + 
"rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999178647994995, + "sampling/importance_sampling_ratio/min": 0.0011708807433024049, + "sampling/sampling_logp_difference/max": 6.749999046325684, + "sampling/sampling_logp_difference/mean": 0.01974140852689743, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.6514521121280268e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6514521121280268e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15535.0, + "completions/mean_length": 6626.4296875, + "completions/mean_terminated_length": 6549.5986328125, + "completions/min_length": 1746.0, + "completions/min_terminated_length": 1746.0, + "entropy": 1.0323699787259102, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003505800850689411, + "learning_rate": 1e-05, + "loss": 0.0885, + "num_tokens": 151313834.0, + "reward": 0.390625, + "reward_std": 0.17176413536071777, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381303787231, + "sampling/importance_sampling_ratio/min": 2.8102756914449856e-05, + "sampling/sampling_logp_difference/max": 10.479642868041992, + "sampling/sampling_logp_difference/mean": 0.021082937717437744, + "step": 194 + }, + { + "clip_ratio/high_max": 2.006086378969485e-05, + "clip_ratio/high_mean": 5.890002398700744e-06, + "clip_ratio/low_mean": 3.503898199141986e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.092898473118112e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15595.0, + "completions/mean_length": 7093.109375, + 
"completions/mean_terminated_length": 6870.12841796875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 1.0206764563918114, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002495395252481103, + "learning_rate": 1e-05, + "loss": 0.0308, + "num_tokens": 152238192.0, + "reward": 0.2890625, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999728798866272, + "sampling/importance_sampling_ratio/min": 9.536534344078973e-05, + "sampling/sampling_logp_difference/max": 9.257795333862305, + "sampling/sampling_logp_difference/mean": 0.020610272884368896, + "step": 195 + }, + { + "clip_ratio/high_max": 3.2352409107261337e-06, + "clip_ratio/high_mean": 8.088102276815334e-07, + "clip_ratio/low_mean": 4.056704699451075e-05, + "clip_ratio/low_min": 1.1648833606159315e-05, + "clip_ratio/region_mean": 4.1375856994818605e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14191.0, + "completions/mean_length": 6795.71875, + "completions/mean_terminated_length": 6486.4189453125, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "entropy": 0.8927837759256363, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014066790463402867, + "learning_rate": 1e-05, + "loss": -0.0031, + "num_tokens": 153131828.0, + "reward": 0.3359375, + "reward_std": 0.2409384548664093, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 5.093755135021638e-06, + "sampling/sampling_logp_difference/max": 12.187495231628418, + 
"sampling/sampling_logp_difference/mean": 0.01874586008489132, + "step": 196 + }, + { + "clip_ratio/high_max": 1.5244630048982799e-05, + "clip_ratio/high_mean": 3.8111575122456998e-06, + "clip_ratio/low_mean": 3.655197178886738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.03631290737394e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15831.0, + "completions/mean_length": 7075.1015625, + "completions/mean_terminated_length": 6617.28662109375, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 0.8989318311214447, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017937121447175741, + "learning_rate": 1e-05, + "loss": 0.0359, + "num_tokens": 154057097.0, + "reward": 0.3984375, + "reward_std": 0.23068872094154358, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998950958251953, + "sampling/importance_sampling_ratio/min": 0.00021659507183358073, + "sampling/sampling_logp_difference/max": 8.437480926513672, + "sampling/sampling_logp_difference/mean": 0.01890135183930397, + "step": 197 + }, + { + "clip_ratio/high_max": 1.4074375030759256e-05, + "clip_ratio/high_mean": 4.977033995601232e-06, + "clip_ratio/low_mean": 3.2670792506905855e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.764782627513341e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14100.0, + "completions/mean_length": 7120.0, + "completions/mean_terminated_length": 6743.41455078125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.8758384585380554, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003410576842725277, + 
"learning_rate": 1e-05, + "loss": 0.0536, + "num_tokens": 154988585.0, + "reward": 0.3984375, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999953508377075, + "sampling/importance_sampling_ratio/min": 0.003589102067053318, + "sampling/sampling_logp_difference/max": 5.629853248596191, + "sampling/sampling_logp_difference/mean": 0.018400676548480988, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.977112736994968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.977112736994968e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 6590.6796875, + "completions/mean_terminated_length": 6513.56689453125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9243742749094963, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003304310142993927, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 155851000.0, + "reward": 0.3984375, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999579787254333, + "sampling/importance_sampling_ratio/min": 1.2693599273916334e-06, + "sampling/sampling_logp_difference/max": 13.576997756958008, + "sampling/sampling_logp_difference/mean": 0.01959652081131935, + "step": 199 + }, + { + "clip_ratio/high_max": 1.1435367014200892e-05, + "clip_ratio/high_mean": 2.858841753550223e-06, + "clip_ratio/low_mean": 4.7742656533955596e-05, + "clip_ratio/low_min": 8.646529749967158e-06, + "clip_ratio/region_mean": 5.0601498060132144e-05, 
+ "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 6999.484375, + "completions/mean_terminated_length": 6696.7578125, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.843244343996048, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023830258287489414, + "learning_rate": 1e-05, + "loss": 0.1142, + "num_tokens": 156766782.0, + "reward": 0.359375, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998635053634644, + "sampling/importance_sampling_ratio/min": 0.00014761318743694574, + "sampling/sampling_logp_difference/max": 8.820915222167969, + "sampling/sampling_logp_difference/mean": 0.018434934318065643, + "step": 200 + }, + { + "clip_ratio/high_max": 2.5114631171163637e-05, + "clip_ratio/high_mean": 7.040741365926806e-06, + "clip_ratio/low_mean": 5.3607667723554187e-05, + "clip_ratio/low_min": 9.219345429301029e-06, + "clip_ratio/region_mean": 6.064840863473364e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14986.0, + "completions/mean_length": 6407.5, + "completions/mean_terminated_length": 6249.14306640625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.9549195989966393, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024427250027656555, + "learning_rate": 1e-05, + "loss": 0.0795, + "num_tokens": 157606126.0, + "reward": 0.3515625, + "reward_std": 0.32879000902175903, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 0.0002305622911080718, + "sampling/sampling_logp_difference/max": 8.37498950958252, + "sampling/sampling_logp_difference/mean": 0.0192743968218565, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.928529067958152e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.928529067958152e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15519.0, + "completions/mean_length": 6638.390625, + "completions/mean_terminated_length": 5901.328125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.9070822075009346, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002024515997618437, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 158474248.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999830722808838, + "sampling/importance_sampling_ratio/min": 0.0036068728659301996, + "sampling/sampling_logp_difference/max": 5.624914169311523, + "sampling/sampling_logp_difference/mean": 0.01955476775765419, + "step": 202 + }, + { + "clip_ratio/high_max": 8.365173471247545e-06, + "clip_ratio/high_mean": 2.091293367811886e-06, + "clip_ratio/low_mean": 4.1470637825113954e-05, + "clip_ratio/low_min": 4.027710474474588e-06, + "clip_ratio/region_mean": 4.356193130661268e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 7324.546875, + "completions/mean_terminated_length": 6878.99951171875, + "completions/min_length": 571.0, + 
"completions/min_terminated_length": 571.0, + "entropy": 0.9108889549970627, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022787705529481173, + "learning_rate": 1e-05, + "loss": 0.0616, + "num_tokens": 159434350.0, + "reward": 0.3359375, + "reward_std": 0.26515230536460876, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999351501464844, + "sampling/importance_sampling_ratio/min": 0.03948089852929115, + "sampling/sampling_logp_difference/max": 3.231938362121582, + "sampling/sampling_logp_difference/mean": 0.019122496247291565, + "step": 203 + }, + { + "clip_ratio/high_max": 8.65733409227687e-06, + "clip_ratio/high_mean": 2.1643335230692173e-06, + "clip_ratio/low_mean": 3.456336048657249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.672769389595487e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13983.0, + "completions/mean_length": 5520.4453125, + "completions/mean_terminated_length": 5434.9052734375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.8982062339782715, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026195270475000143, + "learning_rate": 1e-05, + "loss": 0.049, + "num_tokens": 160163055.0, + "reward": 0.4375, + "reward_std": 0.24831004440784454, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 0.0005541297141462564, + "sampling/sampling_logp_difference/max": 7.498111724853516, + "sampling/sampling_logp_difference/mean": 0.019064132124185562, + "step": 204 + }, + { + "clip_ratio/high_max": 
1.8376186289970065e-05, + "clip_ratio/high_mean": 6.650576210631698e-06, + "clip_ratio/low_mean": 4.059042771586974e-05, + "clip_ratio/low_min": 5.350111223378917e-06, + "clip_ratio/region_mean": 4.724100449493562e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15267.0, + "completions/max_terminated_length": 15267.0, + "completions/mean_length": 6846.515625, + "completions/mean_terminated_length": 6846.515625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9657742157578468, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0014831912703812122, + "learning_rate": 1e-05, + "loss": 0.006, + "num_tokens": 161057657.0, + "reward": 0.296875, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999252557754517, + "sampling/importance_sampling_ratio/min": 6.252834282349795e-05, + "sampling/sampling_logp_difference/max": 9.679890632629395, + "sampling/sampling_logp_difference/mean": 0.020372584462165833, + "step": 205 + }, + { + "clip_ratio/high_max": 1.658901419432368e-05, + "clip_ratio/high_mean": 4.14725354858092e-06, + "clip_ratio/low_mean": 4.473214539757464e-05, + "clip_ratio/low_min": 2.9674999950657366e-06, + "clip_ratio/region_mean": 4.887939894615556e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16370.0, + "completions/mean_length": 6946.8984375, + "completions/mean_terminated_length": 6642.4755859375, + "completions/min_length": 1133.0, + "completions/min_terminated_length": 1133.0, + "entropy": 0.8490508273243904, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017962189158424735, + "learning_rate": 1e-05, + "loss": 0.0696, + "num_tokens": 161966356.0, + "reward": 0.4296875, + 
"reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 7.035569433355704e-05, + "sampling/sampling_logp_difference/max": 9.561946868896484, + "sampling/sampling_logp_difference/mean": 0.019146796315908432, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.22491199540309e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.22491199540309e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15123.0, + "completions/mean_length": 6618.9765625, + "completions/mean_terminated_length": 6463.9765625, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.9541772454977036, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017619321588426828, + "learning_rate": 1e-05, + "loss": 0.0509, + "num_tokens": 162836705.0, + "reward": 0.390625, + "reward_std": 0.2130674123764038, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999436140060425, + "sampling/importance_sampling_ratio/min": 4.2106199771296815e-07, + "sampling/sampling_logp_difference/max": 14.680485725402832, + "sampling/sampling_logp_difference/mean": 0.020236656069755554, + "step": 207 + }, + { + "clip_ratio/high_max": 1.6846054222696694e-05, + "clip_ratio/high_mean": 4.211513555674173e-06, + "clip_ratio/low_mean": 3.877300162002939e-05, + "clip_ratio/low_min": 4.230834292684449e-06, + "clip_ratio/region_mean": 4.298451551676408e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 12469.0, + "completions/mean_length": 5485.71875, + "completions/mean_terminated_length": 5312.73046875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.8888534903526306, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002670915797352791, + "learning_rate": 1e-05, + "loss": 0.0709, + "num_tokens": 163558197.0, + "reward": 0.46875, + "reward_std": 0.3145885467529297, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000442266464233, + "sampling/importance_sampling_ratio/min": 0.0005042250850237906, + "sampling/sampling_logp_difference/max": 7.592487812042236, + "sampling/sampling_logp_difference/mean": 0.019581373780965805, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6889288480779214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6889288480779214e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16184.0, + "completions/mean_length": 4345.171875, + "completions/mean_terminated_length": 4250.3779296875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.8308270424604416, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004005427472293377, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 164133499.0, + "reward": 0.578125, + "reward_std": 0.31642353534698486, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999247193336487, + "sampling/importance_sampling_ratio/min": 0.022981969639658928, + "sampling/sampling_logp_difference/max": 
3.773045301437378, + "sampling/sampling_logp_difference/mean": 0.017508968710899353, + "step": 209 + }, + { + "clip_ratio/high_max": 1.2997116300539346e-05, + "clip_ratio/high_mean": 3.2492790751348366e-06, + "clip_ratio/low_mean": 2.723402121773688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0483300406558556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5227.296875, + "completions/mean_terminated_length": 5050.20654296875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 0.9231975972652435, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0031033784616738558, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 164823681.0, + "reward": 0.4765625, + "reward_std": 0.29249146580696106, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999896764755249, + "sampling/importance_sampling_ratio/min": 0.0021342060063034296, + "sampling/sampling_logp_difference/max": 6.149660587310791, + "sampling/sampling_logp_difference/mean": 0.019171088933944702, + "step": 210 + }, + { + "clip_ratio/high_max": 2.0835890609305352e-05, + "clip_ratio/high_mean": 5.208972652326338e-06, + "clip_ratio/low_mean": 2.9314877565411734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.452385044511175e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14160.0, + "completions/mean_length": 6473.4765625, + "completions/mean_terminated_length": 6316.1669921875, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 0.9061874598264694, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.003495733719319105, + "learning_rate": 1e-05, + "loss": 0.0785, + "num_tokens": 165668798.0, + "reward": 0.4765625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000354051589966, + "sampling/importance_sampling_ratio/min": 0.0004697878030128777, + "sampling/sampling_logp_difference/max": 7.663229465484619, + "sampling/sampling_logp_difference/mean": 0.018978482112288475, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.991967162164656e-05, + "clip_ratio/low_min": 6.304534053924726e-06, + "clip_ratio/region_mean": 3.991967162164656e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14659.0, + "completions/mean_length": 7140.1953125, + "completions/mean_terminated_length": 6605.4296875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.9605444446206093, + "epoch": 0.19503219871205152, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002381941769272089, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 166603375.0, + "reward": 0.3046875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 0.00043123820796608925, + "sampling/sampling_logp_difference/max": 7.748849868774414, + "sampling/sampling_logp_difference/mean": 0.021141134202480316, + "step": 212 + }, + { + "clip_ratio/high_max": 1.4948576790629886e-05, + "clip_ratio/high_mean": 3.7371441976574715e-06, + "clip_ratio/low_mean": 3.4953729482367635e-05, + "clip_ratio/low_min": 3.991060111729894e-06, + 
"clip_ratio/region_mean": 3.869087413477246e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13770.0, + "completions/mean_length": 5304.46875, + "completions/mean_terminated_length": 5038.56005859375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.9176690131425858, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040566748939454556, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 167302275.0, + "reward": 0.4296875, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999827742576599, + "sampling/importance_sampling_ratio/min": 5.001809313398553e-07, + "sampling/sampling_logp_difference/max": 14.508296012878418, + "sampling/sampling_logp_difference/mean": 0.018822530284523964, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.653866999935417e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.653866999935417e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15791.0, + "completions/mean_length": 5796.5, + "completions/mean_terminated_length": 5542.400390625, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.9230027198791504, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021502040326595306, + "learning_rate": 1e-05, + "loss": 0.0737, + "num_tokens": 168063627.0, + "reward": 0.3828125, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999223351478577, + "sampling/importance_sampling_ratio/min": 0.009504453279078007, + "sampling/sampling_logp_difference/max": 4.655994892120361, + "sampling/sampling_logp_difference/mean": 0.01985779032111168, + "step": 214 + }, + { + "clip_ratio/high_max": 1.0863841453101486e-05, + "clip_ratio/high_mean": 2.7159603632753715e-06, + "clip_ratio/low_mean": 2.4175752741939505e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6891713218901714e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14814.0, + "completions/mean_length": 6135.4921875, + "completions/mean_terminated_length": 6054.79541015625, + "completions/min_length": 1259.0, + "completions/min_terminated_length": 1259.0, + "entropy": 0.869445689022541, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027786416467279196, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 168867858.0, + "reward": 0.4609375, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999550580978394, + "sampling/importance_sampling_ratio/min": 2.6089865059475414e-05, + "sampling/sampling_logp_difference/max": 10.553963661193848, + "sampling/sampling_logp_difference/mean": 0.018514130264520645, + "step": 215 + }, + { + "clip_ratio/high_max": 4.36788013757905e-06, + "clip_ratio/high_mean": 1.0919700343947625e-06, + "clip_ratio/low_mean": 1.993327998661698e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0852980330564606e-06, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15691.0, + "completions/mean_length": 6268.2421875, + "completions/mean_terminated_length": 6025.46435546875, + "completions/min_length": 
627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.951081782579422, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0007328780484385788, + "learning_rate": 1e-05, + "loss": 0.0188, + "num_tokens": 169689969.0, + "reward": 0.3828125, + "reward_std": 0.10994865000247955, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000133514404297, + "sampling/importance_sampling_ratio/min": 1.6650999896228313e-05, + "sampling/sampling_logp_difference/max": 11.003040313720703, + "sampling/sampling_logp_difference/mean": 0.02005261555314064, + "step": 216 + }, + { + "clip_ratio/high_max": 2.131336282218399e-05, + "clip_ratio/high_mean": 5.3283407055459975e-06, + "clip_ratio/low_mean": 3.5254403428552905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.058274430462916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13861.0, + "completions/mean_length": 5440.8984375, + "completions/mean_terminated_length": 5354.732421875, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 0.8271932750940323, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034721922129392624, + "learning_rate": 1e-05, + "loss": -0.0245, + "num_tokens": 170409292.0, + "reward": 0.53125, + "reward_std": 0.30327308177948, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998912811279297, + "sampling/importance_sampling_ratio/min": 1.8372484191786498e-05, + "sampling/sampling_logp_difference/max": 10.904656410217285, + "sampling/sampling_logp_difference/mean": 0.019136395305395126, + "step": 217 + }, + { + 
"clip_ratio/high_max": 1.2339016848272877e-05, + "clip_ratio/high_mean": 4.13687178024702e-06, + "clip_ratio/low_mean": 2.156280152121326e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.569967330146028e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15086.0, + "completions/mean_length": 6671.046875, + "completions/mean_terminated_length": 6594.56689453125, + "completions/min_length": 748.0, + "completions/min_terminated_length": 748.0, + "entropy": 0.9659745842218399, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027575206477195024, + "learning_rate": 1e-05, + "loss": 0.0286, + "num_tokens": 171280714.0, + "reward": 0.375, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411702156067, + "sampling/importance_sampling_ratio/min": 1.5700872609158978e-05, + "sampling/sampling_logp_difference/max": 11.06179428100586, + "sampling/sampling_logp_difference/mean": 0.019089506939053535, + "step": 218 + }, + { + "clip_ratio/high_max": 1.4603458112105727e-05, + "clip_ratio/high_mean": 3.650864528026432e-06, + "clip_ratio/low_mean": 3.2977761520669446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.662862599185246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15752.0, + "completions/mean_length": 7781.5546875, + "completions/mean_terminated_length": 7504.05615234375, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 1.1691131889820099, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012711051385849714, + "learning_rate": 1e-05, + "loss": 0.0115, + "num_tokens": 172302489.0, + "reward": 0.109375, + 
"reward_std": 0.1751839816570282, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998820424079895, + "sampling/importance_sampling_ratio/min": 0.005086081102490425, + "sampling/sampling_logp_difference/max": 5.281247615814209, + "sampling/sampling_logp_difference/mean": 0.023309212177991867, + "step": 219 + }, + { + "clip_ratio/high_max": 6.842087486802484e-06, + "clip_ratio/high_mean": 1.710521871700621e-06, + "clip_ratio/low_mean": 4.5269940528669395e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6980462457213434e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14891.0, + "completions/mean_length": 6489.96875, + "completions/mean_terminated_length": 6332.9208984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9354017227888107, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016933141741901636, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 173149653.0, + "reward": 0.484375, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 0.008998609147965908, + "sampling/sampling_logp_difference/max": 4.7106852531433105, + "sampling/sampling_logp_difference/mean": 0.019165027886629105, + "step": 220 + }, + { + "clip_ratio/high_max": 2.444740721330163e-05, + "clip_ratio/high_mean": 6.111851803325408e-06, + "clip_ratio/low_mean": 3.0998270403870265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.711012095664046e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14943.0, + 
"completions/max_terminated_length": 14943.0, + "completions/mean_length": 6309.75, + "completions/mean_terminated_length": 6309.75, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 1.012483686208725, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024940327275544405, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 173976797.0, + "reward": 0.4375, + "reward_std": 0.2790592610836029, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 0.0018720829393714666, + "sampling/sampling_logp_difference/max": 6.280703544616699, + "sampling/sampling_logp_difference/mean": 0.020797956734895706, + "step": 221 + }, + { + "clip_ratio/high_max": 1.1112337460872368e-05, + "clip_ratio/high_mean": 3.5388877677178243e-06, + "clip_ratio/low_mean": 1.7024583712554886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.056347148027271e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16362.0, + "completions/mean_length": 7574.984375, + "completions/mean_terminated_length": 7363.568359375, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9144782647490501, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002748408354818821, + "learning_rate": 1e-05, + "loss": 0.0588, + "num_tokens": 174965259.0, + "reward": 0.2734375, + "reward_std": 0.25224411487579346, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000108480453491, + "sampling/importance_sampling_ratio/min": 0.005681300535798073, + 
"sampling/sampling_logp_difference/max": 5.170575141906738, + "sampling/sampling_logp_difference/mean": 0.019229793921113014, + "step": 222 + }, + { + "clip_ratio/high_max": 1.4946090004741563e-05, + "clip_ratio/high_mean": 3.736522501185391e-06, + "clip_ratio/low_mean": 3.722507381098694e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.096159636901575e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6962.7734375, + "completions/mean_terminated_length": 6499.43408203125, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9248140156269073, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020343128126114607, + "learning_rate": 1e-05, + "loss": 0.0714, + "num_tokens": 175876446.0, + "reward": 0.421875, + "reward_std": 0.3156445026397705, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 0.0001609467581147328, + "sampling/sampling_logp_difference/max": 8.734436988830566, + "sampling/sampling_logp_difference/mean": 0.01860032044351101, + "step": 223 + }, + { + "clip_ratio/high_max": 4.226114015182247e-06, + "clip_ratio/high_mean": 1.0565285037955618e-06, + "clip_ratio/low_mean": 3.189400638348161e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.295053488727717e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14978.0, + "completions/mean_length": 6422.28125, + "completions/mean_terminated_length": 6264.1591796875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.7786787301301956, + "epoch": 0.20607175712971482, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0029119597747921944, + "learning_rate": 1e-05, + "loss": 0.1116, + "num_tokens": 176717226.0, + "reward": 0.578125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918937683105, + "sampling/importance_sampling_ratio/min": 0.0006287595024332404, + "sampling/sampling_logp_difference/max": 7.371761798858643, + "sampling/sampling_logp_difference/mean": 0.01786171644926071, + "step": 224 + }, + { + "clip_ratio/high_max": 5.4112551879370585e-06, + "clip_ratio/high_mean": 1.3528137969842646e-06, + "clip_ratio/low_mean": 2.103693077515345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2389744572137715e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16030.0, + "completions/mean_length": 6662.65625, + "completions/mean_terminated_length": 6508.349609375, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9501350447535515, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0027519147843122482, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 177586766.0, + "reward": 0.421875, + "reward_std": 0.21382881700992584, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000051259994507, + "sampling/importance_sampling_ratio/min": 2.507045428501442e-05, + "sampling/sampling_logp_difference/max": 10.593820571899414, + "sampling/sampling_logp_difference/mean": 0.020679686218500137, + "step": 225 + }, + { + "clip_ratio/high_max": 3.2487785119883483e-06, + "clip_ratio/high_mean": 8.121946279970871e-07, + "clip_ratio/low_mean": 
5.783435085504607e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8646545539886574e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15419.0, + "completions/mean_length": 6546.171875, + "completions/mean_terminated_length": 6146.259765625, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "entropy": 0.9217342138290405, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017936143558472395, + "learning_rate": 1e-05, + "loss": 0.0748, + "num_tokens": 178444556.0, + "reward": 0.3984375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000327825546265, + "sampling/importance_sampling_ratio/min": 8.447741129202768e-05, + "sampling/sampling_logp_difference/max": 9.379026412963867, + "sampling/sampling_logp_difference/mean": 0.019764548167586327, + "step": 226 + }, + { + "clip_ratio/high_max": 2.1980493102091714e-05, + "clip_ratio/high_mean": 5.4951232755229285e-06, + "clip_ratio/low_mean": 4.3977801396977156e-05, + "clip_ratio/low_min": 7.912247156127705e-06, + "clip_ratio/region_mean": 4.947292427459615e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15707.0, + "completions/max_terminated_length": 15707.0, + "completions/mean_length": 6433.9296875, + "completions/mean_terminated_length": 6433.9296875, + "completions/min_length": 731.0, + "completions/min_terminated_length": 731.0, + "entropy": 0.9361409991979599, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031324021983891726, + "learning_rate": 1e-05, + "loss": 0.0505, + "num_tokens": 179288499.0, + "reward": 0.453125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 
0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999454021453857, + "sampling/importance_sampling_ratio/min": 0.00018488657951820642, + "sampling/sampling_logp_difference/max": 8.595767974853516, + "sampling/sampling_logp_difference/mean": 0.019691072404384613, + "step": 227 + }, + { + "clip_ratio/high_max": 1.299416817346355e-05, + "clip_ratio/high_mean": 3.2485420433658874e-06, + "clip_ratio/low_mean": 3.756406420052372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.081260635757644e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15787.0, + "completions/mean_length": 6037.75, + "completions/mean_terminated_length": 5873.52392578125, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "entropy": 0.8700985535979271, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024714914616197348, + "learning_rate": 1e-05, + "loss": 0.0044, + "num_tokens": 180079619.0, + "reward": 0.484375, + "reward_std": 0.21436560153961182, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999628067016602, + "sampling/importance_sampling_ratio/min": 8.4841696661897e-05, + "sampling/sampling_logp_difference/max": 9.374723434448242, + "sampling/sampling_logp_difference/mean": 0.018519341945648193, + "step": 228 + }, + { + "clip_ratio/high_max": 7.293307589861797e-06, + "clip_ratio/high_mean": 1.8233268974654493e-06, + "clip_ratio/low_mean": 2.2305866423266707e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.412919320704532e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12264.0, + "completions/max_terminated_length": 12264.0, + "completions/mean_length": 5305.828125, + 
"completions/mean_terminated_length": 5305.828125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 1.1309608668088913, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003593914210796356, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 180780877.0, + "reward": 0.3984375, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011920928955, + "sampling/importance_sampling_ratio/min": 0.009941472671926022, + "sampling/sampling_logp_difference/max": 4.611040115356445, + "sampling/sampling_logp_difference/mean": 0.020471621304750443, + "step": 229 + }, + { + "clip_ratio/high_max": 2.0163415001661633e-05, + "clip_ratio/high_mean": 5.040853750415408e-06, + "clip_ratio/low_mean": 4.4980357415624894e-05, + "clip_ratio/low_min": 1.0012816346716136e-05, + "clip_ratio/region_mean": 5.0021211109196884e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13814.0, + "completions/mean_length": 6022.96875, + "completions/mean_terminated_length": 5774.30419921875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8560900762677193, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029816587921231985, + "learning_rate": 1e-05, + "loss": 0.0913, + "num_tokens": 181571465.0, + "reward": 0.515625, + "reward_std": 0.41504397988319397, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 1.5958334188326262e-05, + "sampling/sampling_logp_difference/max": 11.04552936553955, + 
"sampling/sampling_logp_difference/mean": 0.0181986466050148, + "step": 230 + }, + { + "clip_ratio/high_max": 1.8430865566188004e-05, + "clip_ratio/high_mean": 6.177042905619601e-06, + "clip_ratio/low_mean": 4.450247388376738e-05, + "clip_ratio/low_min": 4.840271230932558e-06, + "clip_ratio/region_mean": 5.067951724413433e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15130.0, + "completions/max_terminated_length": 15130.0, + "completions/mean_length": 6647.71875, + "completions/mean_terminated_length": 6647.71875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.9455481320619583, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0031632622703909874, + "learning_rate": 1e-05, + "loss": 0.1317, + "num_tokens": 182440957.0, + "reward": 0.3828125, + "reward_std": 0.39902517199516296, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000306367874146, + "sampling/importance_sampling_ratio/min": 1.4739508515049238e-05, + "sampling/sampling_logp_difference/max": 11.124979019165039, + "sampling/sampling_logp_difference/mean": 0.01906408555805683, + "step": 231 + }, + { + "clip_ratio/high_max": 2.2937053017813014e-05, + "clip_ratio/high_mean": 5.7342632544532535e-06, + "clip_ratio/low_mean": 6.042617155799235e-05, + "clip_ratio/low_min": 1.1000354334100848e-05, + "clip_ratio/region_mean": 6.616043401663774e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15988.0, + "completions/mean_length": 6809.1640625, + "completions/mean_terminated_length": 6500.29833984375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 1.050546184182167, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.00162694591563195, + "learning_rate": 1e-05, + "loss": 0.0346, + "num_tokens": 183332242.0, + "reward": 0.421875, + "reward_std": 0.33616161346435547, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000290870666504, + "sampling/importance_sampling_ratio/min": 4.244970114086755e-06, + "sampling/sampling_logp_difference/max": 12.369775772094727, + "sampling/sampling_logp_difference/mean": 0.021866722032427788, + "step": 232 + }, + { + "clip_ratio/high_max": 1.4678411844215589e-05, + "clip_ratio/high_mean": 3.669602961053897e-06, + "clip_ratio/low_mean": 2.4373607971028832e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8043211159456405e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6815.5, + "completions/mean_terminated_length": 6506.83837890625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 1.060033954679966, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024887355975806713, + "learning_rate": 1e-05, + "loss": 0.1059, + "num_tokens": 184225138.0, + "reward": 0.328125, + "reward_std": 0.2869548499584198, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999393820762634, + "sampling/importance_sampling_ratio/min": 0.00012930770753882825, + "sampling/sampling_logp_difference/max": 8.953315734863281, + "sampling/sampling_logp_difference/mean": 0.02019432932138443, + "step": 233 + }, + { + "clip_ratio/high_max": 7.910891326901037e-06, + "clip_ratio/high_mean": 1.9777228317252593e-06, + "clip_ratio/low_mean": 3.8802519611635944e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.078024221598753e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15838.0, + "completions/mean_length": 6928.4453125, + "completions/mean_terminated_length": 6623.42724609375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "entropy": 0.9051575735211372, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002783838426694274, + "learning_rate": 1e-05, + "loss": 0.0624, + "num_tokens": 185136323.0, + "reward": 0.3359375, + "reward_std": 0.25460803508758545, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999524354934692, + "sampling/importance_sampling_ratio/min": 1.0146355634788051e-05, + "sampling/sampling_logp_difference/max": 11.498395919799805, + "sampling/sampling_logp_difference/mean": 0.01905050128698349, + "step": 234 + }, + { + "clip_ratio/high_max": 4.399394583742833e-06, + "clip_ratio/high_mean": 1.0998486459357082e-06, + "clip_ratio/low_mean": 1.733424267058581e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8434091430208355e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14703.0, + "completions/mean_length": 7155.1328125, + "completions/mean_terminated_length": 7082.46435546875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 1.0119014978408813, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002105508930981159, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 186071324.0, + "reward": 0.328125, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999904990196228, + "sampling/importance_sampling_ratio/min": 0.003494206117466092, + "sampling/sampling_logp_difference/max": 5.656649112701416, + "sampling/sampling_logp_difference/mean": 0.020860780030488968, + "step": 235 + }, + { + "clip_ratio/high_max": 1.0561529961705673e-05, + "clip_ratio/high_mean": 3.4390433256703545e-06, + "clip_ratio/low_mean": 2.8499469067355676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193851205196552e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16176.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 7463.2421875, + "completions/mean_terminated_length": 7463.2421875, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "entropy": 0.9983502700924873, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013582308311015368, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 187045035.0, + "reward": 0.3984375, + "reward_std": 0.2517249584197998, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 0.000473080639494583, + "sampling/sampling_logp_difference/max": 7.65624475479126, + "sampling/sampling_logp_difference/mean": 0.021131811663508415, + "step": 236 + }, + { + "clip_ratio/high_max": 8.509013468938065e-06, + "clip_ratio/high_mean": 2.127253367234516e-06, + "clip_ratio/low_mean": 3.985050443588989e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.197775751890731e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14938.0, + "completions/mean_length": 6460.984375, + "completions/mean_terminated_length": 
6382.8505859375, + "completions/min_length": 1747.0, + "completions/min_terminated_length": 1747.0, + "entropy": 0.7869217246770859, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002681629965081811, + "learning_rate": 1e-05, + "loss": 0.0987, + "num_tokens": 187889609.0, + "reward": 0.5234375, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999568462371826, + "sampling/importance_sampling_ratio/min": 0.0015037209959700704, + "sampling/sampling_logp_difference/max": 6.499812602996826, + "sampling/sampling_logp_difference/mean": 0.016937749460339546, + "step": 237 + }, + { + "clip_ratio/high_max": 1.2362176221358823e-05, + "clip_ratio/high_mean": 3.0905440553397057e-06, + "clip_ratio/low_mean": 5.0333514764133724e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.342405825103924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15893.0, + "completions/mean_length": 6241.78125, + "completions/mean_terminated_length": 6161.92138671875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "entropy": 1.0217387825250626, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021239183843135834, + "learning_rate": 1e-05, + "loss": 0.0353, + "num_tokens": 188706605.0, + "reward": 0.2578125, + "reward_std": 0.3135277330875397, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999796748161316, + "sampling/importance_sampling_ratio/min": 0.004853047896176577, + "sampling/sampling_logp_difference/max": 5.328148365020752, + "sampling/sampling_logp_difference/mean": 
0.02103862166404724, + "step": 238 + }, + { + "clip_ratio/high_max": 6.725130333506968e-06, + "clip_ratio/high_mean": 1.681282583376742e-06, + "clip_ratio/low_mean": 3.437372129155847e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.605500387493521e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15332.0, + "completions/mean_length": 5638.1328125, + "completions/mean_terminated_length": 5553.51953125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.7844365313649178, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023868419229984283, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 189446294.0, + "reward": 0.515625, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000369548797607, + "sampling/importance_sampling_ratio/min": 0.0008047468145377934, + "sampling/sampling_logp_difference/max": 7.124982833862305, + "sampling/sampling_logp_difference/mean": 0.017401430755853653, + "step": 239 + }, + { + "clip_ratio/high_max": 2.887730215661577e-05, + "clip_ratio/high_mean": 7.219325539153942e-06, + "clip_ratio/low_mean": 2.826443028425274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.548375502759882e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16196.0, + "completions/mean_length": 6374.8046875, + "completions/mean_terminated_length": 6215.9287109375, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9472770467400551, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027549315709620714, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 
190281461.0, + "reward": 0.3984375, + "reward_std": 0.3167053163051605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998682737350464, + "sampling/importance_sampling_ratio/min": 7.100860239006579e-05, + "sampling/sampling_logp_difference/max": 9.552709579467773, + "sampling/sampling_logp_difference/mean": 0.020243138074874878, + "step": 240 + }, + { + "clip_ratio/high_max": 1.586787766427733e-05, + "clip_ratio/high_mean": 3.9669694160693325e-06, + "clip_ratio/low_mean": 2.978218674343225e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.374915604581474e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15097.0, + "completions/mean_length": 6654.21875, + "completions/mean_terminated_length": 6499.88134765625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 1.0028243213891983, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013344973558560014, + "learning_rate": 1e-05, + "loss": 0.0184, + "num_tokens": 191156249.0, + "reward": 0.359375, + "reward_std": 0.22832971811294556, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 0.0021875568199902773, + "sampling/sampling_logp_difference/max": 6.124969959259033, + "sampling/sampling_logp_difference/mean": 0.020470600575208664, + "step": 241 + }, + { + "clip_ratio/high_max": 1.681529829511419e-05, + "clip_ratio/high_mean": 4.9954849146160996e-06, + "clip_ratio/low_mean": 2.040554932136729e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5401033553862362e-05, + "completions/clipped_ratio": 0.0234375, 
+ "completions/max_length": 16384.0, + "completions/max_terminated_length": 16172.0, + "completions/mean_length": 6767.7890625, + "completions/mean_terminated_length": 6537.00048828125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.9059296399354935, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016136945923790336, + "learning_rate": 1e-05, + "loss": 0.0816, + "num_tokens": 192040526.0, + "reward": 0.4921875, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999668598175049, + "sampling/importance_sampling_ratio/min": 1.2452921509975567e-05, + "sampling/sampling_logp_difference/max": 11.29355525970459, + "sampling/sampling_logp_difference/mean": 0.020058143883943558, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9821966563758906e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9821966563758906e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16275.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 6767.4921875, + "completions/mean_terminated_length": 6767.4921875, + "completions/min_length": 998.0, + "completions/min_terminated_length": 998.0, + "entropy": 1.0446822568774223, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002869367366656661, + "learning_rate": 1e-05, + "loss": 0.0212, + "num_tokens": 192926469.0, + "reward": 0.3828125, + "reward_std": 0.2517249882221222, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586343765259, + "sampling/importance_sampling_ratio/min": 
1.9328599591972306e-05, + "sampling/sampling_logp_difference/max": 10.853924751281738, + "sampling/sampling_logp_difference/mean": 0.021512050181627274, + "step": 243 + }, + { + "clip_ratio/high_max": 3.44581130775623e-05, + "clip_ratio/high_mean": 1.3001711295146379e-05, + "clip_ratio/low_mean": 3.6407937841431703e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.940964981869911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16261.0, + "completions/max_terminated_length": 16261.0, + "completions/mean_length": 5738.484375, + "completions/mean_terminated_length": 5738.484375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.8617956340312958, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002177527640014887, + "learning_rate": 1e-05, + "loss": -0.0189, + "num_tokens": 193678859.0, + "reward": 0.5546875, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570846557617, + "sampling/importance_sampling_ratio/min": 0.0008533780346624553, + "sampling/sampling_logp_difference/max": 7.06630802154541, + "sampling/sampling_logp_difference/mean": 0.018141131848096848, + "step": 244 + }, + { + "clip_ratio/high_max": 3.861003733618418e-06, + "clip_ratio/high_mean": 9.652509334046044e-07, + "clip_ratio/low_mean": 2.7767115511778684e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8732366558870126e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15595.0, + "completions/mean_length": 6382.90625, + "completions/mean_terminated_length": 5976.357421875, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.8692388981580734, + "epoch": 0.22539098436062557, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.004127771593630314, + "learning_rate": 1e-05, + "loss": 0.0572, + "num_tokens": 194511847.0, + "reward": 0.4140625, + "reward_std": 0.2767002582550049, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 5.4239239943854045e-06, + "sampling/sampling_logp_difference/max": 12.124691009521484, + "sampling/sampling_logp_difference/mean": 0.018376430496573448, + "step": 245 + }, + { + "clip_ratio/high_max": 9.728395525598899e-06, + "clip_ratio/high_mean": 2.4320988813997246e-06, + "clip_ratio/low_mean": 5.3631663831765763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.606376271316549e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14504.0, + "completions/max_terminated_length": 14504.0, + "completions/mean_length": 5776.15625, + "completions/mean_terminated_length": 5776.15625, + "completions/min_length": 1018.0, + "completions/min_terminated_length": 1018.0, + "entropy": 1.1195004731416702, + "epoch": 0.22631094756209752, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00263008801266551, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 195270051.0, + "reward": 0.421875, + "reward_std": 0.3618982434272766, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999971866607666, + "sampling/importance_sampling_ratio/min": 0.005209421273320913, + "sampling/sampling_logp_difference/max": 5.257286548614502, + "sampling/sampling_logp_difference/mean": 0.019923292100429535, + "step": 246 + }, + { + "clip_ratio/high_max": 1.2701100786216557e-05, + "clip_ratio/high_mean": 3.1752751965541393e-06, + "clip_ratio/low_mean": 4.2162768181697174e-05, + 
"clip_ratio/low_min": 3.873926743835909e-06, + "clip_ratio/region_mean": 4.5338043378251314e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 7411.421875, + "completions/mean_terminated_length": 7196.08056640625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.9801053553819656, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002642859937623143, + "learning_rate": 1e-05, + "loss": 0.07, + "num_tokens": 196240913.0, + "reward": 0.390625, + "reward_std": 0.27328529953956604, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999198913574219, + "sampling/importance_sampling_ratio/min": 0.00017500204558018595, + "sampling/sampling_logp_difference/max": 8.650712966918945, + "sampling/sampling_logp_difference/mean": 0.021511007100343704, + "step": 247 + }, + { + "clip_ratio/high_max": 1.5122936929401476e-05, + "clip_ratio/high_mean": 3.780734232350369e-06, + "clip_ratio/low_mean": 6.367217611114029e-05, + "clip_ratio/low_min": 4.8010447244450916e-06, + "clip_ratio/region_mean": 6.745291057086433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16127.0, + "completions/mean_length": 7944.65625, + "completions/mean_terminated_length": 7742.1123046875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 1.0132562816143036, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002439325675368309, + "learning_rate": 1e-05, + "loss": 0.0564, + "num_tokens": 197278517.0, + "reward": 0.34375, + "reward_std": 0.3161812424659729, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 
0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999248385429382, + "sampling/importance_sampling_ratio/min": 1.0140610356756952e-05, + "sampling/sampling_logp_difference/max": 11.49896240234375, + "sampling/sampling_logp_difference/mean": 0.02124868705868721, + "step": 248 + }, + { + "clip_ratio/high_max": 2.6017536356448545e-05, + "clip_ratio/high_mean": 6.504384089112136e-06, + "clip_ratio/low_mean": 3.7791321346958284e-05, + "clip_ratio/low_min": 3.2110563097376144e-06, + "clip_ratio/region_mean": 4.429570503816649e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 7550.0, + "completions/mean_terminated_length": 7409.7783203125, + "completions/min_length": 1469.0, + "completions/min_terminated_length": 1469.0, + "entropy": 1.0384011715650558, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014879995724186301, + "learning_rate": 1e-05, + "loss": 0.0338, + "num_tokens": 198265589.0, + "reward": 0.3359375, + "reward_std": 0.24040167033672333, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999468922615051, + "sampling/importance_sampling_ratio/min": 8.418659126618877e-05, + "sampling/sampling_logp_difference/max": 9.382474899291992, + "sampling/sampling_logp_difference/mean": 0.021503347903490067, + "step": 249 + }, + { + "clip_ratio/high_max": 1.3615457191917812e-05, + "clip_ratio/high_mean": 4.491880531531933e-06, + "clip_ratio/low_mean": 3.916533574965797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.365721684962409e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16221.0, + "completions/mean_length": 8140.9140625, + 
"completions/mean_terminated_length": 7517.48779296875, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.8718572407960892, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002340668346732855, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 199324938.0, + "reward": 0.453125, + "reward_std": 0.35824596881866455, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999454021453857, + "sampling/importance_sampling_ratio/min": 0.002325017238035798, + "sampling/sampling_logp_difference/max": 6.064027786254883, + "sampling/sampling_logp_difference/mean": 0.019466478377580643, + "step": 250 + }, + { + "clip_ratio/high_max": 2.2175697040438536e-05, + "clip_ratio/high_mean": 5.543924260109634e-06, + "clip_ratio/low_mean": 4.1318608055007644e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.686253225827386e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16263.0, + "completions/mean_length": 6630.96875, + "completions/mean_terminated_length": 6396.896484375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.7798146530985832, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001989356242120266, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 200189902.0, + "reward": 0.5625, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474883079529, + "sampling/importance_sampling_ratio/min": 0.0003315774374641478, + "sampling/sampling_logp_difference/max": 8.011649131774902, + 
"sampling/sampling_logp_difference/mean": 0.01849902793765068, + "step": 251 + }, + { + "clip_ratio/high_max": 3.325706302348408e-06, + "clip_ratio/high_mean": 8.31426575587102e-07, + "clip_ratio/low_mean": 2.0285911205064622e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.111733795118198e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15357.0, + "completions/max_terminated_length": 15357.0, + "completions/mean_length": 6582.203125, + "completions/mean_terminated_length": 6582.203125, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 1.0181676000356674, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002594445599243045, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 201052832.0, + "reward": 0.34375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999495148658752, + "sampling/importance_sampling_ratio/min": 0.0003853558446280658, + "sampling/sampling_logp_difference/max": 7.8613433837890625, + "sampling/sampling_logp_difference/mean": 0.021598614752292633, + "step": 252 + }, + { + "clip_ratio/high_max": 2.2044430352252675e-05, + "clip_ratio/high_mean": 5.511107588063169e-06, + "clip_ratio/low_mean": 3.4155824209847196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96669319115972e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14540.0, + "completions/max_terminated_length": 14540.0, + "completions/mean_length": 6145.1796875, + "completions/mean_terminated_length": 6145.1796875, + "completions/min_length": 1098.0, + "completions/min_terminated_length": 1098.0, + "entropy": 0.9084350541234016, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003104996867477894, + "learning_rate": 1e-05, + 
"loss": 0.0811, + "num_tokens": 201858047.0, + "reward": 0.5078125, + "reward_std": 0.33220985531806946, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 0.007650630082935095, + "sampling/sampling_logp_difference/max": 4.87296724319458, + "sampling/sampling_logp_difference/mean": 0.018979094922542572, + "step": 253 + }, + { + "clip_ratio/high_max": 2.9959978519400465e-05, + "clip_ratio/high_mean": 7.489994629850116e-06, + "clip_ratio/low_mean": 3.5255963325653283e-05, + "clip_ratio/low_min": 2.973075879708631e-06, + "clip_ratio/region_mean": 4.274595892184152e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15745.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 7259.953125, + "completions/mean_terminated_length": 7259.953125, + "completions/min_length": 960.0, + "completions/min_terminated_length": 960.0, + "entropy": 0.9823614731431007, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003212577663362026, + "learning_rate": 1e-05, + "loss": 0.0133, + "num_tokens": 202807673.0, + "reward": 0.4765625, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999860405921936, + "sampling/importance_sampling_ratio/min": 0.000536504783667624, + "sampling/sampling_logp_difference/max": 7.530435085296631, + "sampling/sampling_logp_difference/mean": 0.021432969719171524, + "step": 254 + }, + { + "clip_ratio/high_max": 3.273996276220714e-05, + "clip_ratio/high_mean": 9.095591565255745e-06, + "clip_ratio/low_mean": 2.9539680099333054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8635271948805894e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16369.0, + "completions/mean_length": 7258.71875, + "completions/mean_terminated_length": 7113.87353515625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8823810070753098, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001418307889252901, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 203757333.0, + "reward": 0.40625, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884963035583, + "sampling/importance_sampling_ratio/min": 0.0006408974295482039, + "sampling/sampling_logp_difference/max": 7.3526411056518555, + "sampling/sampling_logp_difference/mean": 0.019296500831842422, + "step": 255 + }, + { + "clip_ratio/high_max": 1.544119368190877e-05, + "clip_ratio/high_mean": 3.860298420477193e-06, + "clip_ratio/low_mean": 3.755458698151415e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.141488631148604e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7011.40625, + "completions/mean_terminated_length": 6386.56689453125, + "completions/min_length": 685.0, + "completions/min_terminated_length": 685.0, + "entropy": 0.8057166337966919, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001652427832596004, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 204675065.0, + "reward": 0.46875, + "reward_std": 0.24146251380443573, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918937683105, + 
"sampling/importance_sampling_ratio/min": 0.015319154597818851, + "sampling/sampling_logp_difference/max": 4.178651332855225, + "sampling/sampling_logp_difference/mean": 0.018787402659654617, + "step": 256 + }, + { + "clip_ratio/high_max": 5.222041181696113e-06, + "clip_ratio/high_mean": 2.209917965956265e-06, + "clip_ratio/low_mean": 4.0701652551433654e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.291157006264257e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14796.0, + "completions/max_terminated_length": 14796.0, + "completions/mean_length": 6243.4296875, + "completions/mean_terminated_length": 6243.4296875, + "completions/min_length": 1023.0, + "completions/min_terminated_length": 1023.0, + "entropy": 0.9856048971414566, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001482579973526299, + "learning_rate": 1e-05, + "loss": 0.0677, + "num_tokens": 205494344.0, + "reward": 0.5390625, + "reward_std": 0.28930407762527466, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998942613601685, + "sampling/importance_sampling_ratio/min": 0.0004254466330166906, + "sampling/sampling_logp_difference/max": 7.762371063232422, + "sampling/sampling_logp_difference/mean": 0.019727632403373718, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 6.842733455414418e-05, + "clip_ratio/low_min": 9.297655878981459e-06, + "clip_ratio/region_mean": 6.842733455414418e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15485.0, + "completions/mean_length": 7122.2421875, + "completions/mean_terminated_length": 6586.4375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.8625433370471001, + "epoch": 
0.23735050597976082, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002006452763453126, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 206428775.0, + "reward": 0.40625, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999338388442993, + "sampling/importance_sampling_ratio/min": 0.00010911409481195733, + "sampling/sampling_logp_difference/max": 9.123116493225098, + "sampling/sampling_logp_difference/mean": 0.01927522011101246, + "step": 258 + }, + { + "clip_ratio/high_max": 2.887607206503162e-05, + "clip_ratio/high_mean": 7.219018016257905e-06, + "clip_ratio/low_mean": 2.7790995090981596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.501001378936053e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15254.0, + "completions/mean_length": 7965.2734375, + "completions/mean_terminated_length": 7623.6826171875, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 1.0068430602550507, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0029176415409892797, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 207469586.0, + "reward": 0.3828125, + "reward_std": 0.2212003916501999, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998945593833923, + "sampling/importance_sampling_ratio/min": 4.06005028708023e-06, + "sampling/sampling_logp_difference/max": 12.414315223693848, + "sampling/sampling_logp_difference/mean": 0.02198987640440464, + "step": 259 + }, + { + "clip_ratio/high_max": 8.710998599781306e-06, + "clip_ratio/high_mean": 2.1777496499453264e-06, + 
"clip_ratio/low_mean": 4.1899779091636447e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.407752874158177e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6329.4296875, + "completions/mean_terminated_length": 6169.83349609375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.9399363100528717, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019115234026685357, + "learning_rate": 1e-05, + "loss": 0.0399, + "num_tokens": 208300217.0, + "reward": 0.4375, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000567436218262, + "sampling/importance_sampling_ratio/min": 2.1449603082146496e-05, + "sampling/sampling_logp_difference/max": 10.749804496765137, + "sampling/sampling_logp_difference/mean": 0.020002204924821854, + "step": 260 + }, + { + "clip_ratio/high_max": 2.536784450057894e-05, + "clip_ratio/high_mean": 6.341961125144735e-06, + "clip_ratio/low_mean": 5.959111433639919e-05, + "clip_ratio/low_min": 1.1521060741870315e-05, + "clip_ratio/region_mean": 6.593307591629127e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15635.0, + "completions/mean_length": 6747.90625, + "completions/mean_terminated_length": 6594.95263671875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.9575144425034523, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003766207257285714, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 209181077.0, + "reward": 0.4375, + "reward_std": 0.3164137303829193, + "rewards/accuracy_reward/mean": 0.4375, + 
"rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999313354492188, + "sampling/importance_sampling_ratio/min": 1.250743298442103e-05, + "sampling/sampling_logp_difference/max": 11.28918743133545, + "sampling/sampling_logp_difference/mean": 0.020067427307367325, + "step": 261 + }, + { + "clip_ratio/high_max": 2.0626074274332495e-05, + "clip_ratio/high_mean": 5.156518568583124e-06, + "clip_ratio/low_mean": 5.808068385704246e-05, + "clip_ratio/low_min": 1.0360539818066172e-05, + "clip_ratio/region_mean": 6.32372018571914e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 6426.6953125, + "completions/mean_terminated_length": 6348.29150390625, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.87480478733778, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002375675830990076, + "learning_rate": 1e-05, + "loss": 0.0752, + "num_tokens": 210023702.0, + "reward": 0.5078125, + "reward_std": 0.38900789618492126, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999383687973022, + "sampling/importance_sampling_ratio/min": 0.00024259372730739415, + "sampling/sampling_logp_difference/max": 8.324122428894043, + "sampling/sampling_logp_difference/mean": 0.018864646553993225, + "step": 262 + }, + { + "clip_ratio/high_max": 4.462851393327583e-06, + "clip_ratio/high_mean": 1.1157128483318957e-06, + "clip_ratio/low_mean": 3.8966268334661436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.008198141036701e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + 
"completions/mean_length": 7223.1484375, + "completions/mean_terminated_length": 6927.63671875, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "entropy": 1.0218688547611237, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016257674433290958, + "learning_rate": 1e-05, + "loss": 0.0791, + "num_tokens": 210969921.0, + "reward": 0.4609375, + "reward_std": 0.2896084189414978, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999514818191528, + "sampling/importance_sampling_ratio/min": 9.193710138788447e-05, + "sampling/sampling_logp_difference/max": 9.294405937194824, + "sampling/sampling_logp_difference/mean": 0.02119653858244419, + "step": 263 + }, + { + "clip_ratio/high_max": 1.2653464409595472e-05, + "clip_ratio/high_mean": 3.163366102398868e-06, + "clip_ratio/low_mean": 4.864477250521304e-05, + "clip_ratio/low_min": 8.641252861707471e-06, + "clip_ratio/region_mean": 5.1808138323394815e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15180.0, + "completions/max_terminated_length": 15180.0, + "completions/mean_length": 6974.0703125, + "completions/mean_terminated_length": 6974.0703125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9265539348125458, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023448490537703037, + "learning_rate": 1e-05, + "loss": 0.0567, + "num_tokens": 211884866.0, + "reward": 0.390625, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000474452972412, + "sampling/importance_sampling_ratio/min": 0.0007677432149648666, + 
"sampling/sampling_logp_difference/max": 7.172055244445801, + "sampling/sampling_logp_difference/mean": 0.020384611561894417, + "step": 264 + }, + { + "clip_ratio/high_max": 1.1967917316724197e-05, + "clip_ratio/high_mean": 2.9919793291810493e-06, + "clip_ratio/low_mean": 3.179497366545547e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.478695157355105e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15657.0, + "completions/mean_length": 7247.2734375, + "completions/mean_terminated_length": 7027.9921875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.9756898358464241, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003212807234376669, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 212833933.0, + "reward": 0.328125, + "reward_std": 0.2398776412010193, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999449253082275, + "sampling/importance_sampling_ratio/min": 0.001600456889718771, + "sampling/sampling_logp_difference/max": 6.437466144561768, + "sampling/sampling_logp_difference/mean": 0.0199666079133749, + "step": 265 + }, + { + "clip_ratio/high_max": 1.1404694760130951e-05, + "clip_ratio/high_mean": 3.887520392709121e-06, + "clip_ratio/low_mean": 4.0242122167910566e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4129643583801226e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15575.0, + "completions/mean_length": 7773.9296875, + "completions/mean_terminated_length": 7423.9267578125, + "completions/min_length": 568.0, + "completions/min_terminated_length": 568.0, + "entropy": 0.9765531942248344, + "epoch": 0.24471021159153633, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0019600428640842438, + "learning_rate": 1e-05, + "loss": 0.0357, + "num_tokens": 213848508.0, + "reward": 0.3984375, + "reward_std": 0.3129909336566925, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 2.430168751743622e-05, + "sampling/sampling_logp_difference/max": 10.624964714050293, + "sampling/sampling_logp_difference/mean": 0.020565161481499672, + "step": 266 + }, + { + "clip_ratio/high_max": 6.725708999510971e-06, + "clip_ratio/high_mean": 1.6814272498777427e-06, + "clip_ratio/low_mean": 2.869901106805628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0380438261090603e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15583.0, + "completions/mean_length": 6722.5, + "completions/mean_terminated_length": 6569.14306640625, + "completions/min_length": 1021.0, + "completions/min_terminated_length": 1021.0, + "entropy": 0.9291529878973961, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014550165506079793, + "learning_rate": 1e-05, + "loss": 0.0235, + "num_tokens": 214731180.0, + "reward": 0.4921875, + "reward_std": 0.19332444667816162, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999178647994995, + "sampling/importance_sampling_ratio/min": 0.007400285452604294, + "sampling/sampling_logp_difference/max": 4.90623664855957, + "sampling/sampling_logp_difference/mean": 0.020057080313563347, + "step": 267 + }, + { + "clip_ratio/high_max": 1.8797170469042612e-05, + "clip_ratio/high_mean": 6.827749643889547e-06, + "clip_ratio/low_mean": 
3.448591337473772e-05, + "clip_ratio/low_min": 4.687090040533803e-06, + "clip_ratio/region_mean": 4.1313662677566754e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15797.0, + "completions/max_terminated_length": 15797.0, + "completions/mean_length": 7001.8671875, + "completions/mean_terminated_length": 7001.8671875, + "completions/min_length": 930.0, + "completions/min_terminated_length": 930.0, + "entropy": 1.0746883526444435, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002483292715623975, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 215645819.0, + "reward": 0.3515625, + "reward_std": 0.32955142855644226, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 1.0195622053288389e-05, + "sampling/sampling_logp_difference/max": 11.493552207946777, + "sampling/sampling_logp_difference/mean": 0.020808640867471695, + "step": 268 + }, + { + "clip_ratio/high_max": 8.748068921704544e-06, + "clip_ratio/high_mean": 2.187017230426136e-06, + "clip_ratio/low_mean": 8.762007928453386e-05, + "clip_ratio/low_min": 2.3698836685071e-05, + "clip_ratio/region_mean": 8.980709480965743e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14189.0, + "completions/mean_length": 6663.796875, + "completions/mean_terminated_length": 6509.50830078125, + "completions/min_length": 1148.0, + "completions/min_terminated_length": 1148.0, + "entropy": 1.0000900849699974, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0015696679474785924, + "learning_rate": 1e-05, + "loss": 0.0731, + "num_tokens": 216519369.0, + "reward": 0.3671875, + "reward_std": 0.3214311897754669, + "rewards/accuracy_reward/mean": 0.3671875, + 
"rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997707605361938, + "sampling/importance_sampling_ratio/min": 1.288027192458685e-06, + "sampling/sampling_logp_difference/max": 13.562398910522461, + "sampling/sampling_logp_difference/mean": 0.022182684391736984, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.567897690321843e-05, + "clip_ratio/low_min": 3.287224444648018e-06, + "clip_ratio/region_mean": 4.567897690321843e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16381.0, + "completions/mean_length": 6978.7421875, + "completions/mean_terminated_length": 6829.45263671875, + "completions/min_length": 1661.0, + "completions/min_terminated_length": 1661.0, + "entropy": 1.0845019966363907, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003887100610882044, + "learning_rate": 1e-05, + "loss": 0.1076, + "num_tokens": 217432432.0, + "reward": 0.3671875, + "reward_std": 0.3124619722366333, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999902248382568, + "sampling/importance_sampling_ratio/min": 0.02168075367808342, + "sampling/sampling_logp_difference/max": 3.8313302993774414, + "sampling/sampling_logp_difference/mean": 0.02127157337963581, + "step": 270 + }, + { + "clip_ratio/high_max": 2.444328310957644e-05, + "clip_ratio/high_mean": 6.11082077739411e-06, + "clip_ratio/low_mean": 5.1527222922231886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7638043699625996e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15638.0, + "completions/mean_length": 5903.5546875, + 
"completions/mean_terminated_length": 5652.0244140625, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "entropy": 0.8638224303722382, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002851828932762146, + "learning_rate": 1e-05, + "loss": 0.0771, + "num_tokens": 218208399.0, + "reward": 0.4453125, + "reward_std": 0.3713914752006531, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000553131103516, + "sampling/importance_sampling_ratio/min": 0.000626727007329464, + "sampling/sampling_logp_difference/max": 7.374999523162842, + "sampling/sampling_logp_difference/mean": 0.01880766451358795, + "step": 271 + }, + { + "clip_ratio/high_max": 8.474872856822913e-06, + "clip_ratio/high_mean": 2.118718214205728e-06, + "clip_ratio/low_mean": 2.5821682072546537e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.794040096887329e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16060.0, + "completions/max_terminated_length": 16060.0, + "completions/mean_length": 5596.7109375, + "completions/mean_terminated_length": 5596.7109375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 1.1127397641539574, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018005800666287541, + "learning_rate": 1e-05, + "loss": 0.0075, + "num_tokens": 218944418.0, + "reward": 0.4375, + "reward_std": 0.29485049843788147, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000048875808716, + "sampling/importance_sampling_ratio/min": 0.01548748929053545, + "sampling/sampling_logp_difference/max": 4.167722702026367, + "sampling/sampling_logp_difference/mean": 
0.02004322223365307, + "step": 272 + }, + { + "clip_ratio/high_max": 1.5034628631838132e-05, + "clip_ratio/high_mean": 4.925485768580984e-06, + "clip_ratio/low_mean": 3.539464648838475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.032013237065257e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16362.0, + "completions/mean_length": 7120.109375, + "completions/mean_terminated_length": 7047.16552734375, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "entropy": 1.0697019025683403, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022711476776748896, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 219875952.0, + "reward": 0.2734375, + "reward_std": 0.23751862347126007, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000036358833313, + "sampling/importance_sampling_ratio/min": 9.733050683280453e-05, + "sampling/sampling_logp_difference/max": 9.237398147583008, + "sampling/sampling_logp_difference/mean": 0.02110595628619194, + "step": 273 + }, + { + "clip_ratio/high_max": 1.0558468147792155e-05, + "clip_ratio/high_mean": 2.6396170369480387e-06, + "clip_ratio/low_mean": 3.796903268948881e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.060864915800266e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 7623.953125, + "completions/mean_terminated_length": 7484.9052734375, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "entropy": 0.8836525157094002, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002609838731586933, + "learning_rate": 1e-05, + "loss": 0.0563, + 
"num_tokens": 220871730.0, + "reward": 0.3046875, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999877214431763, + "sampling/importance_sampling_ratio/min": 0.0015448236372321844, + "sampling/sampling_logp_difference/max": 6.472845554351807, + "sampling/sampling_logp_difference/mean": 0.019322458654642105, + "step": 274 + }, + { + "clip_ratio/high_max": 1.144785210271948e-05, + "clip_ratio/high_mean": 2.86196302567987e-06, + "clip_ratio/low_mean": 5.795533934360719e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 6.081730361984228e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15557.0, + "completions/mean_length": 6778.71875, + "completions/mean_terminated_length": 6703.08642578125, + "completions/min_length": 1187.0, + "completions/min_terminated_length": 1187.0, + "entropy": 0.8968989998102188, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.00395589042454958, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 221761214.0, + "reward": 0.4921875, + "reward_std": 0.4032142758369446, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000214576721191, + "sampling/importance_sampling_ratio/min": 0.0011724763317033648, + "sampling/sampling_logp_difference/max": 6.7486371994018555, + "sampling/sampling_logp_difference/mean": 0.018937086686491966, + "step": 275 + }, + { + "clip_ratio/high_max": 2.708495139813749e-05, + "clip_ratio/high_mean": 7.628764933542698e-06, + "clip_ratio/low_mean": 3.0297362627607072e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.792612744746293e-05, + 
"completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 7319.2578125, + "completions/mean_terminated_length": 6794.85107421875, + "completions/min_length": 1034.0, + "completions/min_terminated_length": 1034.0, + "entropy": 0.870811752974987, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002063714899122715, + "learning_rate": 1e-05, + "loss": 0.0271, + "num_tokens": 222719287.0, + "reward": 0.3203125, + "reward_std": 0.2835301160812378, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999525547027588, + "sampling/importance_sampling_ratio/min": 2.13631665246794e-05, + "sampling/sampling_logp_difference/max": 10.7538423538208, + "sampling/sampling_logp_difference/mean": 0.019336167722940445, + "step": 276 + }, + { + "clip_ratio/high_max": 3.860288416035473e-06, + "clip_ratio/high_mean": 9.650721040088683e-07, + "clip_ratio/low_mean": 2.303871349340625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4003785597415117e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16326.0, + "completions/mean_length": 6207.4140625, + "completions/mean_terminated_length": 5879.13671875, + "completions/min_length": 752.0, + "completions/min_terminated_length": 752.0, + "entropy": 0.8348869979381561, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023463829420506954, + "learning_rate": 1e-05, + "loss": 0.0696, + "num_tokens": 223533372.0, + "reward": 0.4375, + "reward_std": 0.2359210103750229, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
1.0000433921813965, + "sampling/importance_sampling_ratio/min": 2.1447433027788065e-05, + "sampling/sampling_logp_difference/max": 10.749905586242676, + "sampling/sampling_logp_difference/mean": 0.018392907455563545, + "step": 277 + }, + { + "clip_ratio/high_max": 2.1441665467136772e-05, + "clip_ratio/high_mean": 5.360416366784193e-06, + "clip_ratio/low_mean": 5.504566888703266e-05, + "clip_ratio/low_min": 1.2581466762640048e-05, + "clip_ratio/region_mean": 6.040608514013002e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14713.0, + "completions/max_terminated_length": 14713.0, + "completions/mean_length": 6417.2109375, + "completions/mean_terminated_length": 6417.2109375, + "completions/min_length": 981.0, + "completions/min_terminated_length": 981.0, + "entropy": 1.0232173576951027, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033652919810265303, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 224375711.0, + "reward": 0.390625, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999820590019226, + "sampling/importance_sampling_ratio/min": 0.0020559614058583975, + "sampling/sampling_logp_difference/max": 6.18701171875, + "sampling/sampling_logp_difference/mean": 0.020980924367904663, + "step": 278 + }, + { + "clip_ratio/high_max": 4.679544872487895e-06, + "clip_ratio/high_mean": 1.1698862181219738e-06, + "clip_ratio/low_mean": 2.818696702888701e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9356853247008985e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15026.0, + "completions/max_terminated_length": 15026.0, + "completions/mean_length": 5275.9453125, + "completions/mean_terminated_length": 5275.9453125, + "completions/min_length": 473.0, + "completions/min_terminated_length": 
473.0, + "entropy": 0.8563915193080902, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025473968125879765, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 225070992.0, + "reward": 0.703125, + "reward_std": 0.2790592610836029, + "rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999873042106628, + "sampling/importance_sampling_ratio/min": 0.0010016229934990406, + "sampling/sampling_logp_difference/max": 6.906133651733398, + "sampling/sampling_logp_difference/mean": 0.018068701028823853, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.1973035422415705e-05, + "clip_ratio/low_min": 6.267234766710317e-06, + "clip_ratio/region_mean": 4.1973035422415705e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 7693.984375, + "completions/mean_terminated_length": 7556.0478515625, + "completions/min_length": 1349.0, + "completions/min_terminated_length": 1349.0, + "entropy": 0.7832933664321899, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016663498245179653, + "learning_rate": 1e-05, + "loss": 0.0836, + "num_tokens": 226073822.0, + "reward": 0.421875, + "reward_std": 0.3227166533470154, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999273419380188, + "sampling/importance_sampling_ratio/min": 5.893720299354754e-06, + "sampling/sampling_logp_difference/max": 12.04162311553955, + "sampling/sampling_logp_difference/mean": 0.01851016655564308, + "step": 280 + }, + { + "clip_ratio/high_max": 1.304801662627142e-05, + "clip_ratio/high_mean": 
3.262004156567855e-06, + "clip_ratio/low_mean": 3.7096169648975774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.035817426029098e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15207.0, + "completions/mean_length": 6222.4609375, + "completions/mean_terminated_length": 6061.1669921875, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "entropy": 0.8835120126605034, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021045261528342962, + "learning_rate": 1e-05, + "loss": 0.055, + "num_tokens": 226888577.0, + "reward": 0.5078125, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616742134094, + "sampling/importance_sampling_ratio/min": 5.688065698450373e-07, + "sampling/sampling_logp_difference/max": 14.379725456237793, + "sampling/sampling_logp_difference/mean": 0.018851105123758316, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.1754828114571865e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1754828114571865e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16331.0, + "completions/mean_length": 6371.625, + "completions/mean_terminated_length": 6131.328125, + "completions/min_length": 1034.0, + "completions/min_terminated_length": 1034.0, + "entropy": 0.9026313945651054, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030448357574641705, + "learning_rate": 1e-05, + "loss": 0.1009, + "num_tokens": 227722025.0, + "reward": 0.515625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 
0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999712705612183, + "sampling/importance_sampling_ratio/min": 0.00016869648243300617, + "sampling/sampling_logp_difference/max": 8.687409400939941, + "sampling/sampling_logp_difference/mean": 0.018757576122879982, + "step": 282 + }, + { + "clip_ratio/high_max": 7.024085562079563e-06, + "clip_ratio/high_mean": 1.7560213905198907e-06, + "clip_ratio/low_mean": 3.379111592494155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5547137599678535e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15891.0, + "completions/mean_length": 7510.4921875, + "completions/mean_terminated_length": 7224.25, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 1.044313833117485, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019295766251161695, + "learning_rate": 1e-05, + "loss": 0.0513, + "num_tokens": 228703256.0, + "reward": 0.3046875, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999972581863403, + "sampling/importance_sampling_ratio/min": 0.0002186153142247349, + "sampling/sampling_logp_difference/max": 8.428196907043457, + "sampling/sampling_logp_difference/mean": 0.02207346074283123, + "step": 283 + }, + { + "clip_ratio/high_max": 5.068321115686558e-06, + "clip_ratio/high_mean": 1.2670802789216395e-06, + "clip_ratio/low_mean": 3.7797102550030104e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9064182828951743e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 7594.140625, + 
"completions/mean_terminated_length": 7524.92919921875, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9706612005829811, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0017117204843088984, + "learning_rate": 1e-05, + "loss": 0.0748, + "num_tokens": 229697002.0, + "reward": 0.2734375, + "reward_std": 0.18649455904960632, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016212463379, + "sampling/importance_sampling_ratio/min": 0.00035400164779275656, + "sampling/sampling_logp_difference/max": 7.946208953857422, + "sampling/sampling_logp_difference/mean": 0.021097885444760323, + "step": 284 + }, + { + "clip_ratio/high_max": 1.5618601537426002e-05, + "clip_ratio/high_mean": 3.904650384356501e-06, + "clip_ratio/low_mean": 4.570582996166195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.961048034601845e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15562.0, + "completions/mean_length": 6888.9140625, + "completions/mean_terminated_length": 6738.19873046875, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "entropy": 0.9210037142038345, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025933689903467894, + "learning_rate": 1e-05, + "loss": 0.0887, + "num_tokens": 230598679.0, + "reward": 0.4375, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.0007308972999453545, + "sampling/sampling_logp_difference/max": 7.221237659454346, + 
"sampling/sampling_logp_difference/mean": 0.01939917542040348, + "step": 285 + }, + { + "clip_ratio/high_max": 2.398964193162101e-05, + "clip_ratio/high_mean": 6.9283565835576155e-06, + "clip_ratio/low_mean": 4.821338916372042e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.514174608833855e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15458.0, + "completions/mean_length": 6433.640625, + "completions/mean_terminated_length": 6355.29150390625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 1.064419962465763, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0019397985888645053, + "learning_rate": 1e-05, + "loss": 0.0841, + "num_tokens": 231440153.0, + "reward": 0.375, + "reward_std": 0.3451131582260132, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999503493309021, + "sampling/importance_sampling_ratio/min": 0.019039930775761604, + "sampling/sampling_logp_difference/max": 3.961216926574707, + "sampling/sampling_logp_difference/mean": 0.021084938198328018, + "step": 286 + }, + { + "clip_ratio/high_max": 1.9223051822336856e-05, + "clip_ratio/high_mean": 6.997284344834043e-06, + "clip_ratio/low_mean": 5.4512621773028513e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.150990611786256e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14304.0, + "completions/mean_length": 5858.40625, + "completions/mean_terminated_length": 5691.33349609375, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "entropy": 0.8120778575539589, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002288782736286521, + "learning_rate": 
1e-05, + "loss": 0.0408, + "num_tokens": 232209485.0, + "reward": 0.46875, + "reward_std": 0.36637401580810547, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999239444732666, + "sampling/importance_sampling_ratio/min": 0.00017959839897230268, + "sampling/sampling_logp_difference/max": 8.624787330627441, + "sampling/sampling_logp_difference/mean": 0.019076552242040634, + "step": 287 + }, + { + "clip_ratio/high_max": 9.900939403451048e-06, + "clip_ratio/high_mean": 3.4680233511608094e-06, + "clip_ratio/low_mean": 1.8137742017643177e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1605765368803986e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7088.4765625, + "completions/mean_terminated_length": 6710.609375, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "entropy": 0.9231890514492989, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.001075367210432887, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 233133850.0, + "reward": 0.5078125, + "reward_std": 0.18383610248565674, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998681545257568, + "sampling/importance_sampling_ratio/min": 0.005257915705442429, + "sampling/sampling_logp_difference/max": 5.248020648956299, + "sampling/sampling_logp_difference/mean": 0.019140273332595825, + "step": 288 + }, + { + "clip_ratio/high_max": 8.648456969240215e-06, + "clip_ratio/high_mean": 2.1621142423100537e-06, + "clip_ratio/low_mean": 1.838804723774956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0550161480059614e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16384.0, + "completions/mean_length": 6151.78125, + "completions/mean_terminated_length": 5906.20849609375, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.8585417941212654, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0015517349820584059, + "learning_rate": 1e-05, + "loss": 0.0828, + "num_tokens": 233940718.0, + "reward": 0.46875, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000255107879639, + "sampling/importance_sampling_ratio/min": 7.617311348440126e-05, + "sampling/sampling_logp_difference/max": 9.482501983642578, + "sampling/sampling_logp_difference/mean": 0.019276250153779984, + "step": 289 + }, + { + "clip_ratio/high_max": 1.1416668485253467e-05, + "clip_ratio/high_mean": 3.7661499732166703e-06, + "clip_ratio/low_mean": 2.1342358195397537e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5108507770710276e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15133.0, + "completions/mean_length": 7111.2578125, + "completions/mean_terminated_length": 6812.13671875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.9735362678766251, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0036829947493970394, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 234872111.0, + "reward": 0.4296875, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999943971633911, + "sampling/importance_sampling_ratio/min": 0.0006535807042382658, + "sampling/sampling_logp_difference/max": 7.333044528961182, + "sampling/sampling_logp_difference/mean": 0.021356046199798584, + "step": 290 + }, + { + "clip_ratio/high_max": 2.2526005068357335e-05, + "clip_ratio/high_mean": 5.631501267089334e-06, + "clip_ratio/low_mean": 3.30086276107977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.864012808207917e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15995.0, + "completions/mean_length": 6787.671875, + "completions/mean_terminated_length": 6478.11279296875, + "completions/min_length": 1404.0, + "completions/min_terminated_length": 1404.0, + "entropy": 0.8856986835598946, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00234629912301898, + "learning_rate": 1e-05, + "loss": 0.0169, + "num_tokens": 235759149.0, + "reward": 0.5390625, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999296069145203, + "sampling/importance_sampling_ratio/min": 0.00035710117663256824, + "sampling/sampling_logp_difference/max": 7.937491416931152, + "sampling/sampling_logp_difference/mean": 0.01950475014746189, + "step": 291 + }, + { + "clip_ratio/high_max": 2.6025282068076194e-05, + "clip_ratio/high_mean": 6.5063205170190486e-06, + "clip_ratio/low_mean": 4.603358706845029e-05, + "clip_ratio/low_min": 4.53654638477019e-06, + "clip_ratio/region_mean": 5.253990843812062e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15845.0, + "completions/mean_length": 6757.203125, + "completions/mean_terminated_length": 6604.39697265625, + "completions/min_length": 236.0, + 
"completions/min_terminated_length": 236.0, + "entropy": 0.9217840805649757, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034125701058655977, + "learning_rate": 1e-05, + "loss": 0.0527, + "num_tokens": 236643319.0, + "reward": 0.3515625, + "reward_std": 0.2896084189414978, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 6.144329745438881e-06, + "sampling/sampling_logp_difference/max": 11.999980926513672, + "sampling/sampling_logp_difference/mean": 0.020774487406015396, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5210429246035346e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5210429246035346e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 6504.4375, + "completions/mean_terminated_length": 6185.74169921875, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "entropy": 1.126970261335373, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020905097480863333, + "learning_rate": 1e-05, + "loss": 0.0464, + "num_tokens": 237495351.0, + "reward": 0.25, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000053644180298, + "sampling/importance_sampling_ratio/min": 0.0009940610034391284, + "sampling/sampling_logp_difference/max": 6.913712024688721, + "sampling/sampling_logp_difference/mean": 0.023218728601932526, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 1.5693222053414502e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5693222053414502e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15888.0, + "completions/mean_length": 5702.4140625, + "completions/mean_terminated_length": 5446.05615234375, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8803137242794037, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002073790645226836, + "learning_rate": 1e-05, + "loss": 0.0066, + "num_tokens": 238251852.0, + "reward": 0.5625, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000054955482483, + "sampling/importance_sampling_ratio/min": 0.016290459781885147, + "sampling/sampling_logp_difference/max": 4.117175579071045, + "sampling/sampling_logp_difference/mean": 0.0185186006128788, + "step": 294 + }, + { + "clip_ratio/high_max": 1.4213665508577833e-05, + "clip_ratio/high_mean": 4.4483959982244414e-06, + "clip_ratio/low_mean": 2.979715202400257e-05, + "clip_ratio/low_min": 4.1597336348786484e-06, + "clip_ratio/region_mean": 3.424554824960069e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 7176.2890625, + "completions/mean_terminated_length": 6801.99169921875, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.9554997384548187, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002781527815386653, + "learning_rate": 1e-05, + "loss": 0.0908, + "num_tokens": 239189385.0, + "reward": 0.5078125, + "reward_std": 0.3634958863258362, + "rewards/accuracy_reward/mean": 0.5078125, + 
"rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999794960021973, + "sampling/importance_sampling_ratio/min": 0.0018711343873292208, + "sampling/sampling_logp_difference/max": 6.281210422515869, + "sampling/sampling_logp_difference/mean": 0.020436719059944153, + "step": 295 + }, + { + "clip_ratio/high_max": 1.2612186310434481e-05, + "clip_ratio/high_mean": 5.171368570699997e-06, + "clip_ratio/low_mean": 4.8968343890010146e-05, + "clip_ratio/low_min": 4.0222671486844774e-06, + "clip_ratio/region_mean": 5.413971166490228e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16020.0, + "completions/mean_length": 7855.578125, + "completions/mean_terminated_length": 7651.2001953125, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "entropy": 0.9450526610016823, + "epoch": 0.27230910763569455, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003132987068966031, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 240217715.0, + "reward": 0.40625, + "reward_std": 0.28512775897979736, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999253153800964, + "sampling/importance_sampling_ratio/min": 0.0011438478250056505, + "sampling/sampling_logp_difference/max": 6.773357391357422, + "sampling/sampling_logp_difference/mean": 0.021461743861436844, + "step": 296 + }, + { + "clip_ratio/high_max": 2.172341964978841e-05, + "clip_ratio/high_mean": 6.823271291978017e-06, + "clip_ratio/low_mean": 3.516899266742257e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.199226441414794e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14060.0, + 
"completions/mean_length": 6240.265625, + "completions/mean_terminated_length": 5913.04833984375, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.8811023011803627, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028523094952106476, + "learning_rate": 1e-05, + "loss": 0.015, + "num_tokens": 241035133.0, + "reward": 0.484375, + "reward_std": 0.26143303513526917, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000007152557373, + "sampling/importance_sampling_ratio/min": 0.0006931954412721097, + "sampling/sampling_logp_difference/max": 7.274198532104492, + "sampling/sampling_logp_difference/mean": 0.019493088126182556, + "step": 297 + }, + { + "clip_ratio/high_max": 1.2606601558218244e-05, + "clip_ratio/high_mean": 3.151650389554561e-06, + "clip_ratio/low_mean": 3.768150395444536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.08331545713736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15795.0, + "completions/mean_length": 6103.203125, + "completions/mean_terminated_length": 6022.251953125, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.8766692876815796, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026241440791636705, + "learning_rate": 1e-05, + "loss": 0.0089, + "num_tokens": 241836479.0, + "reward": 0.453125, + "reward_std": 0.32589423656463623, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925434589386, + "sampling/importance_sampling_ratio/min": 0.00012664205860346556, + "sampling/sampling_logp_difference/max": 
8.974145889282227, + "sampling/sampling_logp_difference/mean": 0.01907728984951973, + "step": 298 + }, + { + "clip_ratio/high_max": 1.7400974911652156e-05, + "clip_ratio/high_mean": 4.350243727913039e-06, + "clip_ratio/low_mean": 4.527119426711579e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962143839293276e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 7711.0703125, + "completions/mean_terminated_length": 7573.4052734375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 1.0770929008722305, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003654222236946225, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 242844376.0, + "reward": 0.3359375, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999839067459106, + "sampling/importance_sampling_ratio/min": 0.0006267472635954618, + "sampling/sampling_logp_difference/max": 7.374967098236084, + "sampling/sampling_logp_difference/mean": 0.022012868896126747, + "step": 299 + }, + { + "clip_ratio/high_max": 1.4325163647299632e-05, + "clip_ratio/high_mean": 3.581290911824908e-06, + "clip_ratio/low_mean": 4.28195745598714e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6400865016948956e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15905.0, + "completions/mean_length": 6616.5546875, + "completions/mean_terminated_length": 6539.6455078125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.8439916148781776, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.0029195898678153753, + "learning_rate": 1e-05, + "loss": 0.1094, + "num_tokens": 243708479.0, + "reward": 0.453125, + "reward_std": 0.3516485095024109, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 2.189194128732197e-05, + "sampling/sampling_logp_difference/max": 10.729392051696777, + "sampling/sampling_logp_difference/mean": 0.017992788925766945, + "step": 300 + }, + { + "clip_ratio/high_max": 1.848296233220026e-05, + "clip_ratio/high_mean": 4.620740583050065e-06, + "clip_ratio/low_mean": 5.01860952226707e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.480683557834709e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15286.0, + "completions/mean_length": 6173.5234375, + "completions/mean_terminated_length": 6093.1259765625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.8975192531943321, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0017261393368244171, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 244515378.0, + "reward": 0.53125, + "reward_std": 0.3532412052154541, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999533891677856, + "sampling/importance_sampling_ratio/min": 0.000553854217287153, + "sampling/sampling_logp_difference/max": 7.4986090660095215, + "sampling/sampling_logp_difference/mean": 0.019458644092082977, + "step": 301 + }, + { + "clip_ratio/high_max": 4.114005332667148e-05, + "clip_ratio/high_mean": 1.2276760230633954e-05, + "clip_ratio/low_mean": 3.397437080820964e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.625113024303573e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16032.0, + "completions/mean_length": 5640.90625, + "completions/mean_terminated_length": 5470.38134765625, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "entropy": 0.8833519890904427, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018768958980217576, + "learning_rate": 1e-05, + "loss": 0.0731, + "num_tokens": 245258318.0, + "reward": 0.4609375, + "reward_std": 0.3135277330875397, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999516606330872, + "sampling/importance_sampling_ratio/min": 0.0071789538487792015, + "sampling/sampling_logp_difference/max": 4.936601638793945, + "sampling/sampling_logp_difference/mean": 0.019646335393190384, + "step": 302 + }, + { + "clip_ratio/high_max": 1.4196921938491869e-05, + "clip_ratio/high_mean": 4.514302474944998e-06, + "clip_ratio/low_mean": 4.4677519781544106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.919182129015098e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16378.0, + "completions/mean_length": 7840.5078125, + "completions/mean_terminated_length": 7564.9111328125, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "entropy": 0.9772802665829659, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002617602702230215, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 246280663.0, + "reward": 0.328125, + "reward_std": 0.29826050996780396, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, 
+ "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0008982301224023104, + "sampling/sampling_logp_difference/max": 7.015084266662598, + "sampling/sampling_logp_difference/mean": 0.022171074524521828, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7621316146687604e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7621316146687604e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16326.0, + "completions/mean_length": 6316.1015625, + "completions/mean_terminated_length": 6074.47216796875, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "entropy": 0.8542795851826668, + "epoch": 0.2796688132474701, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0011874900665134192, + "learning_rate": 1e-05, + "loss": 0.0513, + "num_tokens": 247107604.0, + "reward": 0.3828125, + "reward_std": 0.2227931022644043, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000126361846924, + "sampling/importance_sampling_ratio/min": 0.00015846268797758967, + "sampling/sampling_logp_difference/max": 8.749991416931152, + "sampling/sampling_logp_difference/mean": 0.018691308796405792, + "step": 304 + }, + { + "clip_ratio/high_max": 3.0959752166381804e-06, + "clip_ratio/high_mean": 7.739938041595451e-07, + "clip_ratio/low_mean": 6.0967123090449604e-05, + "clip_ratio/low_min": 2.711407751121442e-05, + "clip_ratio/region_mean": 6.17411176335736e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6568.171875, + "completions/mean_terminated_length": 6412.365234375, + "completions/min_length": 97.0, + 
"completions/min_terminated_length": 97.0, + "entropy": 0.9063890501856804, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002459619427099824, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 247967322.0, + "reward": 0.5, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998743534088135, + "sampling/importance_sampling_ratio/min": 0.012350871227681637, + "sampling/sampling_logp_difference/max": 4.394028663635254, + "sampling/sampling_logp_difference/mean": 0.020134467631578445, + "step": 305 + }, + { + "clip_ratio/high_max": 5.9507838159333915e-06, + "clip_ratio/high_mean": 1.4876959539833479e-06, + "clip_ratio/low_mean": 2.400908408617397e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.549678004015732e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15714.0, + "completions/mean_length": 8182.28125, + "completions/mean_terminated_length": 7635.50048828125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 1.0137704983353615, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016673406353220344, + "learning_rate": 1e-05, + "loss": 0.0244, + "num_tokens": 249031710.0, + "reward": 0.3359375, + "reward_std": 0.22225631773471832, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998995065689087, + "sampling/importance_sampling_ratio/min": 0.0008049134048633277, + "sampling/sampling_logp_difference/max": 7.1247758865356445, + "sampling/sampling_logp_difference/mean": 0.021704845130443573, + "step": 306 + }, + { + "clip_ratio/high_max": 
1.4527202438330278e-05, + "clip_ratio/high_mean": 3.6318006095825695e-06, + "clip_ratio/low_mean": 3.1829216595724574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5461017205307144e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14502.0, + "completions/max_terminated_length": 14502.0, + "completions/mean_length": 6460.5703125, + "completions/mean_terminated_length": 6460.5703125, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "entropy": 1.0418165400624275, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022682021372020245, + "learning_rate": 1e-05, + "loss": 0.0171, + "num_tokens": 249881047.0, + "reward": 0.359375, + "reward_std": 0.25566887855529785, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999744296073914, + "sampling/importance_sampling_ratio/min": 0.002809183904901147, + "sampling/sampling_logp_difference/max": 5.874861240386963, + "sampling/sampling_logp_difference/mean": 0.02204791083931923, + "step": 307 + }, + { + "clip_ratio/high_max": 9.222687367582694e-06, + "clip_ratio/high_mean": 4.125313353142701e-06, + "clip_ratio/low_mean": 4.836107154915226e-05, + "clip_ratio/low_min": 3.4611657611094415e-06, + "clip_ratio/region_mean": 5.248638444754761e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14754.0, + "completions/mean_length": 6846.3046875, + "completions/mean_terminated_length": 6694.9130859375, + "completions/min_length": 944.0, + "completions/min_terminated_length": 944.0, + "entropy": 0.9839218333363533, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002436346374452114, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 250773806.0, + "reward": 0.484375, + "reward_std": 
0.34299150109291077, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 0.0257408544421196, + "sampling/sampling_logp_difference/max": 3.6596758365631104, + "sampling/sampling_logp_difference/mean": 0.02135510742664337, + "step": 308 + }, + { + "clip_ratio/high_max": 1.3327621218195418e-05, + "clip_ratio/high_mean": 3.3319053045488545e-06, + "clip_ratio/low_mean": 3.791964286392613e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1251548054788145e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15777.0, + "completions/mean_length": 6558.53125, + "completions/mean_terminated_length": 6241.58056640625, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "entropy": 0.7833076938986778, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002015948062762618, + "learning_rate": 1e-05, + "loss": 0.0791, + "num_tokens": 251633074.0, + "reward": 0.46875, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999434947967529, + "sampling/importance_sampling_ratio/min": 5.1445105782477185e-05, + "sampling/sampling_logp_difference/max": 9.874995231628418, + "sampling/sampling_logp_difference/mean": 0.017078280448913574, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.3865982686620555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3865982686620555e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16162.0, + 
"completions/mean_length": 7626.390625, + "completions/mean_terminated_length": 7487.38134765625, + "completions/min_length": 1400.0, + "completions/min_terminated_length": 1400.0, + "entropy": 0.8946382254362106, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001098336186259985, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 252629300.0, + "reward": 0.3359375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000107288360596, + "sampling/importance_sampling_ratio/min": 0.00021643216314259917, + "sampling/sampling_logp_difference/max": 8.438233375549316, + "sampling/sampling_logp_difference/mean": 0.01972624473273754, + "step": 310 + }, + { + "clip_ratio/high_max": 6.5777783220255515e-06, + "clip_ratio/high_mean": 1.6444445805063879e-06, + "clip_ratio/low_mean": 1.7658890669736138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9303335250242526e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15763.0, + "completions/mean_length": 5796.984375, + "completions/mean_terminated_length": 5713.6220703125, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "entropy": 0.969724528491497, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003871417138725519, + "learning_rate": 1e-05, + "loss": 0.0408, + "num_tokens": 253389562.0, + "reward": 0.484375, + "reward_std": 0.23752351105213165, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998880624771118, + "sampling/importance_sampling_ratio/min": 2.4301782104885206e-05, + 
"sampling/sampling_logp_difference/max": 10.624960899353027, + "sampling/sampling_logp_difference/mean": 0.019220752641558647, + "step": 311 + }, + { + "clip_ratio/high_max": 8.099077376755304e-06, + "clip_ratio/high_mean": 2.8300572125772305e-06, + "clip_ratio/low_mean": 3.2033483023496956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.486354006554393e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15769.0, + "completions/mean_length": 6938.5625, + "completions/mean_terminated_length": 6788.63525390625, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.9812447279691696, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002257548039779067, + "learning_rate": 1e-05, + "loss": -0.0089, + "num_tokens": 254295858.0, + "reward": 0.4140625, + "reward_std": 0.2596206068992615, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000464916229248, + "sampling/importance_sampling_ratio/min": 0.0009388317703269422, + "sampling/sampling_logp_difference/max": 6.970874309539795, + "sampling/sampling_logp_difference/mean": 0.02080199122428894, + "step": 312 + }, + { + "clip_ratio/high_max": 4.441917553776875e-06, + "clip_ratio/high_mean": 1.1104793884442188e-06, + "clip_ratio/low_mean": 3.414505465570983e-05, + "clip_ratio/low_min": 3.790060873143375e-06, + "clip_ratio/region_mean": 3.5255534044154047e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15163.0, + "completions/mean_length": 6878.15625, + "completions/mean_terminated_length": 6650.01611328125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.9106859937310219, + "epoch": 0.28794848206071755, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.00420041661709547, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 255197110.0, + "reward": 0.421875, + "reward_std": 0.30433881282806396, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999925494194031, + "sampling/importance_sampling_ratio/min": 0.015217061154544353, + "sampling/sampling_logp_difference/max": 4.185338020324707, + "sampling/sampling_logp_difference/mean": 0.02016574889421463, + "step": 313 + }, + { + "clip_ratio/high_max": 8.814751254249131e-06, + "clip_ratio/high_mean": 2.203687813562283e-06, + "clip_ratio/low_mean": 3.137724206681014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3580929766685585e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 6260.2578125, + "completions/mean_terminated_length": 6260.2578125, + "completions/min_length": 790.0, + "completions/min_terminated_length": 790.0, + "entropy": 0.9523455575108528, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027907798066735268, + "learning_rate": 1e-05, + "loss": 0.0302, + "num_tokens": 256018935.0, + "reward": 0.421875, + "reward_std": 0.2659186124801636, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000364780426025, + "sampling/importance_sampling_ratio/min": 7.485197420464829e-05, + "sampling/sampling_logp_difference/max": 9.499998092651367, + "sampling/sampling_logp_difference/mean": 0.0191945917904377, + "step": 314 + }, + { + "clip_ratio/high_max": 2.8685263259831117e-05, + "clip_ratio/high_mean": 7.171315814957779e-06, + "clip_ratio/low_mean": 2.780131131885355e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.497262770224552e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16124.0, + "completions/mean_length": 6202.828125, + "completions/mean_terminated_length": 6041.22265625, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.8513326346874237, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023744129575788975, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 256841129.0, + "reward": 0.5625, + "reward_std": 0.32407689094543457, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000146627426147, + "sampling/importance_sampling_ratio/min": 9.269781003240496e-06, + "sampling/sampling_logp_difference/max": 11.588750839233398, + "sampling/sampling_logp_difference/mean": 0.019519174471497536, + "step": 315 + }, + { + "clip_ratio/high_max": 1.6381697605538648e-05, + "clip_ratio/high_mean": 4.095424401384662e-06, + "clip_ratio/low_mean": 3.0394592840821133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.449001792432682e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16328.0, + "completions/mean_length": 8019.4609375, + "completions/mean_terminated_length": 7073.90380859375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.9211000874638557, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024705040268599987, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 257884188.0, + "reward": 0.3046875, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999315738677979, + "sampling/importance_sampling_ratio/min": 0.016358470544219017, + "sampling/sampling_logp_difference/max": 4.113009452819824, + "sampling/sampling_logp_difference/mean": 0.01984308287501335, + "step": 316 + }, + { + "clip_ratio/high_max": 7.485402420570608e-06, + "clip_ratio/high_mean": 1.871350605142652e-06, + "clip_ratio/low_mean": 3.025547425750119e-05, + "clip_ratio/low_min": 2.697337095014518e-06, + "clip_ratio/region_mean": 3.212682509001752e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15210.0, + "completions/mean_length": 7257.6875, + "completions/mean_terminated_length": 7038.65625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.8801277950406075, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032848953269422054, + "learning_rate": 1e-05, + "loss": 0.0305, + "num_tokens": 258831852.0, + "reward": 0.4296875, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986124992371, + "sampling/importance_sampling_ratio/min": 0.00019848966621793807, + "sampling/sampling_logp_difference/max": 8.524773597717285, + "sampling/sampling_logp_difference/mean": 0.019743187353014946, + "step": 317 + }, + { + "clip_ratio/high_max": 1.52771035573096e-05, + "clip_ratio/high_mean": 3.8192758893274e-06, + "clip_ratio/low_mean": 3.605492440783564e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.987420052453672e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14903.0, + "completions/mean_length": 6042.84375, + 
"completions/mean_terminated_length": 5878.69873046875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.8792382404208183, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004201764706522226, + "learning_rate": 1e-05, + "loss": 0.099, + "num_tokens": 259623512.0, + "reward": 0.640625, + "reward_std": 0.3913668990135193, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998612403869629, + "sampling/importance_sampling_ratio/min": 0.00027811730979010463, + "sampling/sampling_logp_difference/max": 8.187467575073242, + "sampling/sampling_logp_difference/mean": 0.018901977688074112, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.1642084397608414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1642084397608414e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16381.0, + "completions/mean_length": 7667.6875, + "completions/mean_terminated_length": 7458.49658203125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9096411988139153, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014557713875547051, + "learning_rate": 1e-05, + "loss": 0.0383, + "num_tokens": 260623928.0, + "reward": 0.3515625, + "reward_std": 0.22726887464523315, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0002615761768538505, + "sampling/sampling_logp_difference/max": 8.248785018920898, + "sampling/sampling_logp_difference/mean": 0.01979639381170273, 
+ "step": 319 + }, + { + "clip_ratio/high_max": 2.36019068324822e-05, + "clip_ratio/high_mean": 5.90047670812055e-06, + "clip_ratio/low_mean": 2.704614530557592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2946622809504333e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15514.0, + "completions/max_terminated_length": 15514.0, + "completions/mean_length": 6428.8203125, + "completions/mean_terminated_length": 6428.8203125, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "entropy": 0.9974069148302078, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028210312593728304, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 261465625.0, + "reward": 0.46875, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000195503234863, + "sampling/importance_sampling_ratio/min": 0.001225265790708363, + "sampling/sampling_logp_difference/max": 6.704597473144531, + "sampling/sampling_logp_difference/mean": 0.021066997200250626, + "step": 320 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 261465625, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-320/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-320/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ 
b/dapo_lora_plus_20251202_001141/checkpoint-320/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient.
+ Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint.
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint.
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/README.md b/dapo_lora_plus_20251202_001141/checkpoint-384/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-384/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users 
(both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-384/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-384/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", 
+ "k_proj", + "gate_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-384/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-384/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false 
-%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/latest b/dapo_lora_plus_20251202_001141/checkpoint-384/latest new file mode 100644 index 0000000000000000000000000000000000000000..47a30b050fc0cf5b9cd367ab63c36191546d4ff7 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-384/latest @@ -0,0 +1 @@ +global_step384 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-384/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-384/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/tokenizer_config.json 
b/dapo_lora_plus_20251202_001141/checkpoint-384/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-384/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": 
"LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-384/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6376071922fbdb42eef3a3f5ed4dc7eeb5391aac --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-384/trainer_state.json @@ -0,0 +1,11938 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3532658693652254, + "eval_steps": 500, + "global_step": 384, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025745572056621313, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 5.499582130141789e-06, + "clip_ratio/high_mean": 1.3748955325354473e-06, + "clip_ratio/low_mean": 
2.871888784738985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009378326623846e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 4767.1875, + "completions/mean_terminated_length": 4767.1875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.088237851858139, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002068034838885069, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 1425798.0, + "reward": 0.3046875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 0.01811397261917591, + "sampling/sampling_logp_difference/max": 4.011071681976318, + "sampling/sampling_logp_difference/mean": 0.01877593621611595, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.459846724103045e-05, + "clip_ratio/low_min": 3.4060874440910993e-06, + "clip_ratio/region_mean": 4.459846724103045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 6586.359375, + "completions/mean_terminated_length": 6351.21630859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0497623533010483, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001971944235265255, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 2287420.0, + "reward": 0.28125, + "reward_std": 0.29143062233924866, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999316334724426, + "sampling/importance_sampling_ratio/min": 5.356698966352269e-05, + "sampling/sampling_logp_difference/max": 9.834577560424805, + "sampling/sampling_logp_difference/mean": 0.02137824520468712, + "step": 3 + }, + { + "clip_ratio/high_max": 1.7640652004047297e-05, + "clip_ratio/high_mean": 5.48578327652649e-06, + "clip_ratio/low_mean": 3.218628648937738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.767206976590387e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14690.0, + "completions/max_terminated_length": 14690.0, + "completions/mean_length": 5448.0234375, + "completions/mean_terminated_length": 5448.0234375, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 1.1134418621659279, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016465173102915287, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 3009167.0, + "reward": 0.2890625, + "reward_std": 0.27958330512046814, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 7.889385415182915e-06, + "sampling/sampling_logp_difference/max": 11.749992370605469, + "sampling/sampling_logp_difference/mean": 0.020580951124429703, + "step": 4 + }, + { + "clip_ratio/high_max": 1.3439519989333348e-05, + "clip_ratio/high_mean": 3.359879997333337e-06, + "clip_ratio/low_mean": 2.8849915906903334e-05, + "clip_ratio/low_min": 8.467687621305231e-06, + "clip_ratio/region_mean": 3.220979442630778e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13420.0, + "completions/mean_length": 5436.8671875, + 
"completions/mean_terminated_length": 5350.66943359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 1.1473859176039696, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023770295083522797, + "learning_rate": 1e-05, + "loss": 0.0153, + "num_tokens": 3725654.0, + "reward": 0.2734375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991774559021, + "sampling/importance_sampling_ratio/min": 0.0011146117467433214, + "sampling/sampling_logp_difference/max": 6.799249172210693, + "sampling/sampling_logp_difference/mean": 0.020377254113554955, + "step": 5 + }, + { + "clip_ratio/high_max": 4.652201369026443e-06, + "clip_ratio/high_mean": 1.1630503422566107e-06, + "clip_ratio/low_mean": 2.8399212624208303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9562263534899103e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14440.0, + "completions/max_terminated_length": 14440.0, + "completions/mean_length": 4697.5390625, + "completions/mean_terminated_length": 4697.5390625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.0097229778766632, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003342699259519577, + "learning_rate": 1e-05, + "loss": 0.0326, + "num_tokens": 4345547.0, + "reward": 0.390625, + "reward_std": 0.34480881690979004, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914765357971, + "sampling/importance_sampling_ratio/min": 0.002385853324085474, + "sampling/sampling_logp_difference/max": 6.038198471069336, + 
"sampling/sampling_logp_difference/mean": 0.0185473021119833, + "step": 6 + }, + { + "clip_ratio/high_max": 9.362594937556423e-06, + "clip_ratio/high_mean": 2.340648734389106e-06, + "clip_ratio/low_mean": 6.054362825125281e-05, + "clip_ratio/low_min": 7.427356649714056e-06, + "clip_ratio/region_mean": 6.288427744038927e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14652.0, + "completions/mean_length": 6218.2109375, + "completions/mean_terminated_length": 5890.2822265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.0579778030514717, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002073560608550906, + "learning_rate": 1e-05, + "loss": 0.0201, + "num_tokens": 5160646.0, + "reward": 0.2109375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 0.00044544730917550623, + "sampling/sampling_logp_difference/max": 7.716431617736816, + "sampling/sampling_logp_difference/mean": 0.020321575924754143, + "step": 7 + }, + { + "clip_ratio/high_max": 1.1064067621191498e-05, + "clip_ratio/high_mean": 2.7660169052978745e-06, + "clip_ratio/low_mean": 2.2175867059104348e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4941883737028547e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13637.0, + "completions/mean_length": 5127.8359375, + "completions/mean_terminated_length": 5039.20458984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.0472618415951729, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.0032994600478559732, + "learning_rate": 1e-05, + "loss": 0.0751, + "num_tokens": 5836289.0, + "reward": 0.3359375, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483227729797, + "sampling/importance_sampling_ratio/min": 0.0013780994340777397, + "sampling/sampling_logp_difference/max": 6.587049961090088, + "sampling/sampling_logp_difference/mean": 0.01940803974866867, + "step": 8 + }, + { + "clip_ratio/high_max": 1.2357884770608507e-05, + "clip_ratio/high_mean": 3.0894711926521268e-06, + "clip_ratio/low_mean": 3.000627111759968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.309574231025181e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15916.0, + "completions/mean_length": 4516.890625, + "completions/mean_terminated_length": 4423.44873046875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.911251038312912, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003016560571268201, + "learning_rate": 1e-05, + "loss": 0.1006, + "num_tokens": 6433171.0, + "reward": 0.390625, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999179840087891, + "sampling/importance_sampling_ratio/min": 0.005480794236063957, + "sampling/sampling_logp_difference/max": 5.206505298614502, + "sampling/sampling_logp_difference/mean": 0.017437148839235306, + "step": 9 + }, + { + "clip_ratio/high_max": 4.6329013457580004e-05, + "clip_ratio/high_mean": 1.1582253364395001e-05, + "clip_ratio/low_mean": 7.069455705277505e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 8.227681109929108e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13970.0, + "completions/mean_length": 4961.453125, + "completions/mean_terminated_length": 4687.31201171875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "entropy": 0.6808596402406693, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0035386616364121437, + "learning_rate": 1e-05, + "loss": 0.0596, + "num_tokens": 7085389.0, + "reward": 0.5625, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0002734088629949838, + "sampling/sampling_logp_difference/max": 8.20454216003418, + "sampling/sampling_logp_difference/mean": 0.01566406339406967, + "step": 10 + }, + { + "clip_ratio/high_max": 2.43190661421977e-05, + "clip_ratio/high_mean": 6.079766535549425e-06, + "clip_ratio/low_mean": 2.2395396172214532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8475162707763957e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14776.0, + "completions/mean_length": 4429.40625, + "completions/mean_terminated_length": 4335.275390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9181502386927605, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022535293828696012, + "learning_rate": 1e-05, + "loss": 0.0031, + "num_tokens": 7672185.0, + "reward": 0.3671875, + "reward_std": 0.20357418060302734, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998801946640015, + "sampling/importance_sampling_ratio/min": 5.315856554943821e-08, + "sampling/sampling_logp_difference/max": 16.74998664855957, + "sampling/sampling_logp_difference/mean": 0.018429335206747055, + "step": 11 + }, + { + "clip_ratio/high_max": 1.0117325928149512e-05, + "clip_ratio/high_mean": 2.529331482037378e-06, + "clip_ratio/low_mean": 1.1982813475697185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.45121450714214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5282.6796875, + "completions/mean_terminated_length": 5106.46875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "entropy": 1.113751620054245, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013591813622042537, + "learning_rate": 1e-05, + "loss": 0.0971, + "num_tokens": 8369000.0, + "reward": 0.3984375, + "reward_std": 0.3029736578464508, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 3.970265970565379e-05, + "sampling/sampling_logp_difference/max": 10.134092330932617, + "sampling/sampling_logp_difference/mean": 0.020221836864948273, + "step": 12 + }, + { + "clip_ratio/high_max": 5.411958227341529e-06, + "clip_ratio/high_mean": 1.3529895568353822e-06, + "clip_ratio/low_mean": 2.5284593846208736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6637583516730956e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15925.0, + "completions/mean_length": 6970.421875, + "completions/mean_terminated_length": 6744.49609375, + "completions/min_length": 53.0, + 
"completions/min_terminated_length": 53.0, + "entropy": 1.1721933633089066, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024079051800072193, + "learning_rate": 1e-05, + "loss": 0.0713, + "num_tokens": 9283182.0, + "reward": 0.171875, + "reward_std": 0.17965975403785706, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999163746833801, + "sampling/importance_sampling_ratio/min": 0.0008915197686292231, + "sampling/sampling_logp_difference/max": 7.0225830078125, + "sampling/sampling_logp_difference/mean": 0.021462474018335342, + "step": 13 + }, + { + "clip_ratio/high_max": 2.0661535927501973e-05, + "clip_ratio/high_mean": 5.165383981875493e-06, + "clip_ratio/low_mean": 2.4304956298237812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.947033948430544e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14658.0, + "completions/max_terminated_length": 14658.0, + "completions/mean_length": 4886.875, + "completions/mean_terminated_length": 4886.875, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 1.0108910650014877, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002063734456896782, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 9928446.0, + "reward": 0.3515625, + "reward_std": 0.2409384697675705, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 0.0003672837920021266, + "sampling/sampling_logp_difference/max": 7.9093756675720215, + "sampling/sampling_logp_difference/mean": 0.01918785460293293, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.4761846993424115e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4761846993424115e-06, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12992.0, + "completions/max_terminated_length": 12992.0, + "completions/mean_length": 4824.0078125, + "completions/mean_terminated_length": 4824.0078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 1.1070282831788063, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002424790756776929, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 10566415.0, + "reward": 0.28125, + "reward_std": 0.23698672652244568, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0011708867968991399, + "sampling/sampling_logp_difference/max": 6.749993801116943, + "sampling/sampling_logp_difference/mean": 0.02069389820098877, + "step": 15 + }, + { + "clip_ratio/high_max": 3.5075904634140898e-06, + "clip_ratio/high_mean": 8.768976158535224e-07, + "clip_ratio/low_mean": 2.2676964135825983e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3553861751679506e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12685.0, + "completions/mean_length": 5449.4140625, + "completions/mean_terminated_length": 5363.31494140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9817888736724854, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021046048495918512, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 11281908.0, + "reward": 0.2265625, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.2265625, + 
"rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.013273254036903381, + "sampling/sampling_logp_difference/max": 4.322004318237305, + "sampling/sampling_logp_difference/mean": 0.019556276500225067, + "step": 16 + }, + { + "clip_ratio/high_max": 1.624216065465589e-05, + "clip_ratio/high_mean": 4.060540163663973e-06, + "clip_ratio/low_mean": 5.4349347919924185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.840988796990132e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14133.0, + "completions/max_terminated_length": 14133.0, + "completions/mean_length": 5343.25, + "completions/mean_terminated_length": 5343.25, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 1.04741720110178, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035894038155674934, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 11987692.0, + "reward": 0.3359375, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998996257781982, + "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05, + "sampling/sampling_logp_difference/max": 10.749964714050293, + "sampling/sampling_logp_difference/mean": 0.020530637353658676, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.272115029380075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.272115029380075e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15138.0, + "completions/mean_length": 6301.9375, + "completions/mean_terminated_length": 
5806.09814453125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.8892941772937775, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032246762420982122, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 12814244.0, + "reward": 0.3125, + "reward_std": 0.3606000542640686, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999184608459473, + "sampling/importance_sampling_ratio/min": 0.021351110190153122, + "sampling/sampling_logp_difference/max": 3.846651554107666, + "sampling/sampling_logp_difference/mean": 0.017541853711009026, + "step": 18 + }, + { + "clip_ratio/high_max": 9.956602298188955e-06, + "clip_ratio/high_mean": 2.4891505745472386e-06, + "clip_ratio/low_mean": 2.772165316855535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0210803743102588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16213.0, + "completions/max_terminated_length": 16213.0, + "completions/mean_length": 5297.46875, + "completions/mean_terminated_length": 5297.46875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8097029253840446, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023969109170138836, + "learning_rate": 1e-05, + "loss": -0.0153, + "num_tokens": 13512520.0, + "reward": 0.359375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999222159385681, + "sampling/importance_sampling_ratio/min": 0.005766105372458696, + "sampling/sampling_logp_difference/max": 5.155758380889893, + "sampling/sampling_logp_difference/mean": 0.017464376986026764, + "step": 19 + }, + { + 
"clip_ratio/high_max": 1.0098337497765897e-05, + "clip_ratio/high_mean": 2.524584374441474e-06, + "clip_ratio/low_mean": 3.173396362399217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.425854845318099e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14655.0, + "completions/mean_length": 4890.34375, + "completions/mean_terminated_length": 4799.84228515625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.9267145916819572, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002759338356554508, + "learning_rate": 1e-05, + "loss": -0.0014, + "num_tokens": 14155556.0, + "reward": 0.3515625, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.008491010405123234, + "sampling/sampling_logp_difference/max": 4.768747329711914, + "sampling/sampling_logp_difference/mean": 0.018839433789253235, + "step": 20 + }, + { + "clip_ratio/high_max": 7.532389190600952e-06, + "clip_ratio/high_mean": 1.883097297650238e-06, + "clip_ratio/low_mean": 1.9051809317716106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0934906729053182e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16296.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 4609.40625, + "completions/mean_terminated_length": 4609.40625, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 1.171089917421341, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021055075339972973, + "learning_rate": 1e-05, + "loss": -0.0051, + "num_tokens": 14765328.0, + "reward": 0.2421875, + "reward_std": 
0.2409384548664093, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741911888123, + "sampling/importance_sampling_ratio/min": 5.368983693188056e-07, + "sampling/sampling_logp_difference/max": 14.437457084655762, + "sampling/sampling_logp_difference/mean": 0.020226795226335526, + "step": 21 + }, + { + "clip_ratio/high_max": 1.7169573766295798e-05, + "clip_ratio/high_mean": 4.2923934415739495e-06, + "clip_ratio/low_mean": 5.869748633813288e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.0162142189074075e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14299.0, + "completions/mean_length": 5099.0390625, + "completions/mean_terminated_length": 5010.18115234375, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.005959376692772, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027595218271017075, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 15438549.0, + "reward": 0.296875, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887347221375, + "sampling/importance_sampling_ratio/min": 0.00013984869292471558, + "sampling/sampling_logp_difference/max": 8.87494945526123, + "sampling/sampling_logp_difference/mean": 0.01902824640274048, + "step": 22 + }, + { + "clip_ratio/high_max": 5.162942670722259e-06, + "clip_ratio/high_mean": 1.2907356676805648e-06, + "clip_ratio/low_mean": 3.6872071063953626e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.816280593582633e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16204.0, + "completions/mean_length": 7138.0390625, + "completions/mean_terminated_length": 6839.7822265625, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.0403362140059471, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002748022088780999, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 16373898.0, + "reward": 0.296875, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999048709869385, + "sampling/importance_sampling_ratio/min": 0.0003802926803473383, + "sampling/sampling_logp_difference/max": 7.874569416046143, + "sampling/sampling_logp_difference/mean": 0.020853528752923012, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.6506045439164154e-05, + "clip_ratio/low_min": 5.709326615033206e-06, + "clip_ratio/region_mean": 5.6506045439164154e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14543.0, + "completions/mean_length": 5420.515625, + "completions/mean_terminated_length": 5334.18896484375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 1.1339883506298065, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029502976685762405, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 17088156.0, + "reward": 0.1953125, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 9.70982582657598e-05, + 
"sampling/sampling_logp_difference/max": 9.239787101745605, + "sampling/sampling_logp_difference/mean": 0.0199423898011446, + "step": 24 + }, + { + "clip_ratio/high_max": 5.619998319161823e-06, + "clip_ratio/high_mean": 1.4049995797904558e-06, + "clip_ratio/low_mean": 6.439320418394345e-05, + "clip_ratio/low_min": 4.70632539872895e-06, + "clip_ratio/region_mean": 6.57982034226734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14636.0, + "completions/mean_length": 5116.3046875, + "completions/mean_terminated_length": 4845.88037109375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.9503882825374603, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004891107324510813, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 17766619.0, + "reward": 0.3203125, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0010618992382660508, + "sampling/sampling_logp_difference/max": 6.847696304321289, + "sampling/sampling_logp_difference/mean": 0.01914183795452118, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.839018643247982e-05, + "clip_ratio/low_min": 4.115091087442124e-06, + "clip_ratio/region_mean": 3.839018643247982e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14634.0, + "completions/mean_length": 5061.8671875, + "completions/mean_terminated_length": 4972.71630859375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.0540335327386856, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0030373274348676205, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 18432938.0, + "reward": 0.34375, + "reward_std": 0.28118088841438293, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999624490737915, + "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06, + "sampling/sampling_logp_difference/max": 13.272432327270508, + "sampling/sampling_logp_difference/mean": 0.019548218697309494, + "step": 26 + }, + { + "clip_ratio/high_max": 1.4656657867817557e-05, + "clip_ratio/high_mean": 4.665093399580655e-06, + "clip_ratio/low_mean": 3.751162262233265e-05, + "clip_ratio/low_min": 4.413062470121076e-06, + "clip_ratio/region_mean": 4.2176716192443564e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15782.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6349.9765625, + "completions/mean_terminated_length": 6349.9765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0268081277608871, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017623496241867542, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 19264743.0, + "reward": 0.2734375, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 6.870362267363816e-05, + "sampling/sampling_logp_difference/max": 9.585708618164062, + "sampling/sampling_logp_difference/mean": 0.019106190651655197, + "step": 27 + }, + { + "clip_ratio/high_max": 9.221375876222737e-06, + "clip_ratio/high_mean": 2.3053439690556843e-06, + "clip_ratio/low_mean": 3.09787185415189e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.328406273794826e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 5815.484375, + "completions/mean_terminated_length": 5561.84033203125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 1.0389493256807327, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003111837198957801, + "learning_rate": 1e-05, + "loss": -0.0162, + "num_tokens": 20030109.0, + "reward": 0.34375, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000298023223877, + "sampling/importance_sampling_ratio/min": 0.02987043187022209, + "sampling/sampling_logp_difference/max": 3.5108861923217773, + "sampling/sampling_logp_difference/mean": 0.020060991868376732, + "step": 28 + }, + { + "clip_ratio/high_max": 6.7810142354574054e-06, + "clip_ratio/high_mean": 1.6952535588643514e-06, + "clip_ratio/low_mean": 4.474762545214617e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644287901101052e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 5157.1484375, + "completions/mean_terminated_length": 5068.748046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.0510126948356628, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003041633637621999, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 20710904.0, + "reward": 0.3125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.04357198625802994, + "sampling/sampling_logp_difference/max": 3.133340835571289, + "sampling/sampling_logp_difference/mean": 0.019007597118616104, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0962848566341563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0962848566341563e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15333.0, + "completions/max_terminated_length": 15333.0, + "completions/mean_length": 4446.3828125, + "completions/mean_terminated_length": 4446.3828125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.053279548883438, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022369560319930315, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 21298497.0, + "reward": 0.390625, + "reward_std": 0.24169495701789856, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998750686645508, + "sampling/importance_sampling_ratio/min": 0.006704842206090689, + "sampling/sampling_logp_difference/max": 5.00492525100708, + "sampling/sampling_logp_difference/mean": 0.01947362720966339, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8460265411922592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8460265411922592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15386.0, + "completions/mean_length": 6294.1484375, + "completions/mean_terminated_length": 6133.9921875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, 
+ "entropy": 1.2036212533712387, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021383841522037983, + "learning_rate": 1e-05, + "loss": 0.033, + "num_tokens": 22124812.0, + "reward": 0.171875, + "reward_std": 0.20752590894699097, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858736991882, + "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07, + "sampling/sampling_logp_difference/max": 14.742476463317871, + "sampling/sampling_logp_difference/mean": 0.022367021068930626, + "step": 31 + }, + { + "clip_ratio/high_max": 1.73864664247958e-05, + "clip_ratio/high_mean": 4.34661660619895e-06, + "clip_ratio/low_mean": 3.19569651310303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630358173722925e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14893.0, + "completions/mean_length": 6011.4921875, + "completions/mean_terminated_length": 5929.81884765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.123318687081337, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00126531848218292, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 22915091.0, + "reward": 0.171875, + "reward_std": 0.2330477386713028, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05, + "sampling/sampling_logp_difference/max": 11.02016544342041, + "sampling/sampling_logp_difference/mean": 0.019905246794223785, + "step": 32 + }, + { + "clip_ratio/high_max": 2.8753217975463485e-05, + "clip_ratio/high_mean": 
7.188304493865871e-06, + "clip_ratio/low_mean": 3.818478444372886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.537308905128157e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5152.46875, + "completions/mean_terminated_length": 5064.03125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 1.0477670058608055, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030069497879594564, + "learning_rate": 1e-05, + "loss": 0.1026, + "num_tokens": 23596487.0, + "reward": 0.3359375, + "reward_std": 0.29142576456069946, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 9.009604013954231e-07, + "sampling/sampling_logp_difference/max": 13.919804573059082, + "sampling/sampling_logp_difference/mean": 0.019003981724381447, + "step": 33 + }, + { + "clip_ratio/high_max": 3.069575450354023e-05, + "clip_ratio/high_mean": 7.673938625885057e-06, + "clip_ratio/low_mean": 3.4847614415411954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.252155258654966e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12792.0, + "completions/max_terminated_length": 12792.0, + "completions/mean_length": 4672.5703125, + "completions/mean_terminated_length": 4672.5703125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9471446052193642, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002676331205293536, + "learning_rate": 1e-05, + "loss": 0.0724, + "num_tokens": 24213408.0, + "reward": 0.3203125, + "reward_std": 0.2988021969795227, + "rewards/accuracy_reward/mean": 0.3203125, + 
"rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000251531600952, + "sampling/importance_sampling_ratio/min": 0.0013351094676181674, + "sampling/sampling_logp_difference/max": 6.618741989135742, + "sampling/sampling_logp_difference/mean": 0.0179576613008976, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6127243245355203e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6127243245355203e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16108.0, + "completions/mean_length": 7013.734375, + "completions/mean_terminated_length": 6711.4677734375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 1.1254516392946243, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023615453392267227, + "learning_rate": 1e-05, + "loss": 0.0384, + "num_tokens": 25130262.0, + "reward": 0.1953125, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06, + "sampling/sampling_logp_difference/max": 11.925450325012207, + "sampling/sampling_logp_difference/mean": 0.0215257927775383, + "step": 35 + }, + { + "clip_ratio/high_max": 4.06954040954588e-06, + "clip_ratio/high_mean": 1.01738510238647e-06, + "clip_ratio/low_mean": 4.180071573500754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.281810015527299e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5858.59375, + 
"completions/mean_terminated_length": 5605.984375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 1.0713739022612572, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029018481727689505, + "learning_rate": 1e-05, + "loss": 0.1041, + "num_tokens": 25898194.0, + "reward": 0.3671875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05, + "sampling/sampling_logp_difference/max": 10.992064476013184, + "sampling/sampling_logp_difference/mean": 0.019959844648838043, + "step": 36 + }, + { + "clip_ratio/high_max": 1.2810827229259303e-05, + "clip_ratio/high_mean": 3.2027068073148257e-06, + "clip_ratio/low_mean": 3.29701083501277e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.617281504375569e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14004.0, + "completions/mean_length": 6952.6015625, + "completions/mean_terminated_length": 6726.24853515625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.028619796037674, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022342968732118607, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 26812791.0, + "reward": 0.234375, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 4.540153167909011e-05, + "sampling/sampling_logp_difference/max": 9.999964714050293, + 
"sampling/sampling_logp_difference/mean": 0.02002539485692978, + "step": 37 + }, + { + "clip_ratio/high_max": 1.5225089100567857e-05, + "clip_ratio/high_mean": 6.960676159906143e-06, + "clip_ratio/low_mean": 4.09088329433871e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7869508762232726e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16361.0, + "completions/mean_length": 6413.421875, + "completions/mean_terminated_length": 6174.12841796875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9452399462461472, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021800603717565536, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 27652757.0, + "reward": 0.296875, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439120292664, + "sampling/importance_sampling_ratio/min": 3.895394547726028e-05, + "sampling/sampling_logp_difference/max": 10.153130531311035, + "sampling/sampling_logp_difference/mean": 0.019722118973731995, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.9564903318023426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9564903318023426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15754.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 5176.3515625, + "completions/mean_terminated_length": 5176.3515625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 1.0444758981466293, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004153470974415541, + "learning_rate": 1e-05, + "loss": 0.0798, + 
"num_tokens": 28334386.0, + "reward": 0.2734375, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.007421077694743872, + "sampling/sampling_logp_difference/max": 4.903430938720703, + "sampling/sampling_logp_difference/mean": 0.020159056410193443, + "step": 39 + }, + { + "clip_ratio/high_max": 1.725743459246587e-05, + "clip_ratio/high_mean": 4.3143586481164675e-06, + "clip_ratio/low_mean": 2.0204584302518924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.451894306432223e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 5178.9921875, + "completions/mean_terminated_length": 5001.13525390625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0803537145256996, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002477057045325637, + "learning_rate": 1e-05, + "loss": 0.0067, + "num_tokens": 29017145.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000497102737427, + "sampling/importance_sampling_ratio/min": 0.004630985204130411, + "sampling/sampling_logp_difference/max": 5.374985694885254, + "sampling/sampling_logp_difference/mean": 0.019826076924800873, + "step": 40 + }, + { + "clip_ratio/high_max": 1.6637992303003557e-05, + "clip_ratio/high_mean": 4.159498075750889e-06, + "clip_ratio/low_mean": 2.1970684144889674e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6130182106953725e-05, + "completions/clipped_ratio": 
0.0, + "completions/max_length": 14131.0, + "completions/max_terminated_length": 14131.0, + "completions/mean_length": 4980.359375, + "completions/mean_terminated_length": 4980.359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.9510642662644386, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016275218222290277, + "learning_rate": 1e-05, + "loss": -0.0097, + "num_tokens": 29673535.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750852584839, + "sampling/importance_sampling_ratio/min": 0.000599516904912889, + "sampling/sampling_logp_difference/max": 7.419386386871338, + "sampling/sampling_logp_difference/mean": 0.01844976656138897, + "step": 41 + }, + { + "clip_ratio/high_max": 2.8087193186365766e-05, + "clip_ratio/high_mean": 7.021798296591442e-06, + "clip_ratio/low_mean": 3.9683913541921356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.670571286169434e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 5778.6953125, + "completions/mean_terminated_length": 5695.18896484375, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 1.0413239300251007, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001847646082751453, + "learning_rate": 1e-05, + "loss": -0.0045, + "num_tokens": 30436416.0, + "reward": 0.2578125, + "reward_std": 0.33903977274894714, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998501539230347, + 
"sampling/importance_sampling_ratio/min": 0.00020348970429040492, + "sampling/sampling_logp_difference/max": 8.499895095825195, + "sampling/sampling_logp_difference/mean": 0.021502099931240082, + "step": 42 + }, + { + "clip_ratio/high_max": 2.68402091023745e-05, + "clip_ratio/high_mean": 8.575278570788214e-06, + "clip_ratio/low_mean": 4.547183698377921e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.404711600931478e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14182.0, + "completions/max_terminated_length": 14182.0, + "completions/mean_length": 4875.125, + "completions/mean_terminated_length": 4875.125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 1.0464690178632736, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021134833805263042, + "learning_rate": 1e-05, + "loss": 0.0727, + "num_tokens": 31083672.0, + "reward": 0.40625, + "reward_std": 0.3584783971309662, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340176582336, + "sampling/importance_sampling_ratio/min": 0.012113225646317005, + "sampling/sampling_logp_difference/max": 4.41345739364624, + "sampling/sampling_logp_difference/mean": 0.019140049815177917, + "step": 43 + }, + { + "clip_ratio/high_max": 3.9877967992651975e-05, + "clip_ratio/high_mean": 9.969491998162994e-06, + "clip_ratio/low_mean": 3.981287841270387e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9782369273998484e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 4691.421875, + "completions/mean_terminated_length": 4505.82568359375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 1.0229775309562683, + "epoch": 
0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037735572550445795, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 31703654.0, + "reward": 0.4453125, + "reward_std": 0.2993389964103699, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492168426514, + "sampling/importance_sampling_ratio/min": 0.03150063753128052, + "sampling/sampling_logp_difference/max": 3.457747459411621, + "sampling/sampling_logp_difference/mean": 0.01912039890885353, + "step": 44 + }, + { + "clip_ratio/high_max": 3.5441889849607833e-06, + "clip_ratio/high_mean": 8.860472462401958e-07, + "clip_ratio/low_mean": 1.5137359810069029e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6023407056309225e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 6821.96875, + "completions/mean_terminated_length": 6592.48046875, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 1.1132484003901482, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0010448681423440576, + "learning_rate": 1e-05, + "loss": 0.022, + "num_tokens": 32599778.0, + "reward": 0.2265625, + "reward_std": 0.1814819872379303, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999915361404419, + "sampling/importance_sampling_ratio/min": 0.006500681862235069, + "sampling/sampling_logp_difference/max": 5.035848140716553, + "sampling/sampling_logp_difference/mean": 0.02125459350645542, + "step": 45 + }, + { + "clip_ratio/high_max": 4.652893949241843e-06, + "clip_ratio/high_mean": 1.1632234873104608e-06, + "clip_ratio/low_mean": 
5.731516603191267e-05, + "clip_ratio/low_min": 9.891066838463303e-06, + "clip_ratio/region_mean": 5.8478389746596804e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 6834.3671875, + "completions/mean_terminated_length": 6605.17626953125, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9827468693256378, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0017670176457613707, + "learning_rate": 1e-05, + "loss": 0.1105, + "num_tokens": 33492737.0, + "reward": 0.3046875, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.0021202093921601772, + "sampling/sampling_logp_difference/max": 6.156240463256836, + "sampling/sampling_logp_difference/mean": 0.019490526989102364, + "step": 46 + }, + { + "clip_ratio/high_max": 6.717360520269722e-06, + "clip_ratio/high_mean": 2.503530367903295e-06, + "clip_ratio/low_mean": 2.5672919832686603e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8176450200589898e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14098.0, + "completions/mean_length": 6175.296875, + "completions/mean_terminated_length": 5845.98388671875, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 1.1584237962961197, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0016891945851966739, + "learning_rate": 1e-05, + "loss": -0.0008, + "num_tokens": 34312455.0, + "reward": 0.1875, + "reward_std": 0.19673937559127808, + "rewards/accuracy_reward/mean": 0.1875, + 
"rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 8.086384332273155e-05, + "sampling/sampling_logp_difference/max": 9.422743797302246, + "sampling/sampling_logp_difference/mean": 0.021749887615442276, + "step": 47 + }, + { + "clip_ratio/high_max": 2.2362002255249536e-05, + "clip_ratio/high_mean": 8.189798336388776e-06, + "clip_ratio/low_mean": 2.1058204993096297e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9248002192616696e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16054.0, + "completions/mean_length": 6036.8359375, + "completions/mean_terminated_length": 5955.3623046875, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.9301538467407227, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003834392176941037, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 35102738.0, + "reward": 0.4375, + "reward_std": 0.36614155769348145, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998494386672974, + "sampling/importance_sampling_ratio/min": 0.00013992394087836146, + "sampling/sampling_logp_difference/max": 8.874411582946777, + "sampling/sampling_logp_difference/mean": 0.019147861748933792, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1501961580506759e-05, + "clip_ratio/high_mean": 2.8754903951266897e-06, + "clip_ratio/low_mean": 4.08189714562468e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.369446196506033e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 
6262.46875, + "completions/mean_terminated_length": 5764.68798828125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.8599015846848488, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029804729856550694, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 35924886.0, + "reward": 0.3984375, + "reward_std": 0.3911295533180237, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999922513961792, + "sampling/importance_sampling_ratio/min": 0.00021375219512265176, + "sampling/sampling_logp_difference/max": 9.904524803161621, + "sampling/sampling_logp_difference/mean": 0.01815103553235531, + "step": 49 + }, + { + "clip_ratio/high_max": 2.4107544049911667e-05, + "clip_ratio/high_mean": 6.026886012477917e-06, + "clip_ratio/low_mean": 3.6588148361715866e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.261503391944643e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14556.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 5926.8984375, + "completions/mean_terminated_length": 5926.8984375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 1.0042993426322937, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022071697749197483, + "learning_rate": 1e-05, + "loss": 0.0059, + "num_tokens": 36700913.0, + "reward": 0.3359375, + "reward_std": 0.3306073546409607, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000010371208191, + "sampling/importance_sampling_ratio/min": 0.0005220364546403289, + "sampling/sampling_logp_difference/max": 7.557773113250732, + 
"sampling/sampling_logp_difference/mean": 0.01954064890742302, + "step": 50 + }, + { + "clip_ratio/high_max": 4.9106265578302555e-06, + "clip_ratio/high_mean": 1.2276566394575639e-06, + "clip_ratio/low_mean": 2.634599570683349e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7573652346291055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15217.0, + "completions/mean_length": 6873.6875, + "completions/mean_terminated_length": 6645.4404296875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 1.0255412608385086, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002320924773812294, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 37604865.0, + "reward": 0.234375, + "reward_std": 0.3135228157043457, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999098777770996, + "sampling/importance_sampling_ratio/min": 0.026153141632676125, + "sampling/sampling_logp_difference/max": 3.6437859535217285, + "sampling/sampling_logp_difference/mean": 0.019532475620508194, + "step": 51 + }, + { + "clip_ratio/high_max": 1.6350510122720152e-05, + "clip_ratio/high_mean": 4.087627530680038e-06, + "clip_ratio/low_mean": 2.351988746340794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7607515221461654e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15668.0, + "completions/mean_length": 6073.8984375, + "completions/mean_terminated_length": 5992.71630859375, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 1.0713753998279572, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002212709980085492, + 
"learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 38405196.0, + "reward": 0.359375, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998978972434998, + "sampling/importance_sampling_ratio/min": 8.706459084351081e-06, + "sampling/sampling_logp_difference/max": 11.651445388793945, + "sampling/sampling_logp_difference/mean": 0.021252838894724846, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.729486718384578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.729486718384578e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15299.0, + "completions/mean_length": 5838.71875, + "completions/mean_terminated_length": 5671.33349609375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 1.021155133843422, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001135052996687591, + "learning_rate": 1e-05, + "loss": 0.0178, + "num_tokens": 39171704.0, + "reward": 0.28125, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.003084881929680705, + "sampling/sampling_logp_difference/max": 5.7812418937683105, + "sampling/sampling_logp_difference/mean": 0.020781882107257843, + "step": 53 + }, + { + "clip_ratio/high_max": 1.7124169744420215e-05, + "clip_ratio/high_mean": 4.281042436105054e-06, + "clip_ratio/low_mean": 3.706903294187214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.135007543482061e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 14617.0, + "completions/max_terminated_length": 14617.0, + "completions/mean_length": 6358.5859375, + "completions/mean_terminated_length": 6358.5859375, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9720487147569656, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002638082252815366, + "learning_rate": 1e-05, + "loss": 0.0145, + "num_tokens": 40003859.0, + "reward": 0.40625, + "reward_std": 0.3174618184566498, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000380277633667, + "sampling/importance_sampling_ratio/min": 0.01960253342986107, + "sampling/sampling_logp_difference/max": 3.932096481323242, + "sampling/sampling_logp_difference/mean": 0.01991666667163372, + "step": 54 + }, + { + "clip_ratio/high_max": 6.55582925901399e-06, + "clip_ratio/high_mean": 2.994117721755174e-06, + "clip_ratio/low_mean": 2.222621503733535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5220332759090525e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14753.0, + "completions/max_terminated_length": 14753.0, + "completions/mean_length": 4634.1875, + "completions/mean_terminated_length": 4634.1875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9715309366583824, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001994960242882371, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 40616483.0, + "reward": 0.4375, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000698566436768, + 
"sampling/importance_sampling_ratio/min": 1.0510009815334342e-05, + "sampling/sampling_logp_difference/max": 11.46318244934082, + "sampling/sampling_logp_difference/mean": 0.01902047172188759, + "step": 55 + }, + { + "clip_ratio/high_max": 2.2474248908110894e-05, + "clip_ratio/high_mean": 7.571314540655294e-06, + "clip_ratio/low_mean": 4.3583780325207044e-05, + "clip_ratio/low_min": 4.6013396968191955e-06, + "clip_ratio/region_mean": 5.1155094070054474e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 6596.25, + "completions/mean_terminated_length": 6361.34423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.8207943215966225, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019902780186384916, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 41484443.0, + "reward": 0.4453125, + "reward_std": 0.326668381690979, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016689300537, + "sampling/importance_sampling_ratio/min": 7.485233072657138e-05, + "sampling/sampling_logp_difference/max": 9.499993324279785, + "sampling/sampling_logp_difference/mean": 0.018301833420991898, + "step": 56 + }, + { + "clip_ratio/high_max": 3.0019932637515012e-06, + "clip_ratio/high_mean": 7.504983159378753e-07, + "clip_ratio/low_mean": 4.332785601945943e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.407835376696312e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6785.75, + "completions/mean_terminated_length": 6313.70458984375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + 
"entropy": 0.9876058474183083, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015235114842653275, + "learning_rate": 1e-05, + "loss": 0.0128, + "num_tokens": 42372235.0, + "reward": 0.2421875, + "reward_std": 0.325075626373291, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999551773071289, + "sampling/importance_sampling_ratio/min": 0.026679370552301407, + "sampling/sampling_logp_difference/max": 3.6238646507263184, + "sampling/sampling_logp_difference/mean": 0.019945615902543068, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1349006601667497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1349006601667497e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 4881.2109375, + "completions/mean_terminated_length": 4510.1533203125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.989942155778408, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002033712575212121, + "learning_rate": 1e-05, + "loss": 0.1088, + "num_tokens": 43015238.0, + "reward": 0.4375, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000300407409668, + "sampling/importance_sampling_ratio/min": 0.0001238943514181301, + "sampling/sampling_logp_difference/max": 8.996081352233887, + "sampling/sampling_logp_difference/mean": 0.01887543685734272, + "step": 58 + }, + { + "clip_ratio/high_max": 2.584004687378183e-05, + "clip_ratio/high_mean": 6.4600117184454575e-06, + 
"clip_ratio/low_mean": 2.1371045761497953e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7831058105221018e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15001.0, + "completions/max_terminated_length": 15001.0, + "completions/mean_length": 4725.3984375, + "completions/mean_terminated_length": 4725.3984375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 1.0350637435913086, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030296226032078266, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 43637737.0, + "reward": 0.4453125, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999939203262329, + "sampling/importance_sampling_ratio/min": 0.00022932067804504186, + "sampling/sampling_logp_difference/max": 8.380389213562012, + "sampling/sampling_logp_difference/mean": 0.01995944231748581, + "step": 59 + }, + { + "clip_ratio/high_max": 1.994733975152485e-05, + "clip_ratio/high_mean": 4.986834937881213e-06, + "clip_ratio/low_mean": 3.5168303838872816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.015513832200668e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 4918.171875, + "completions/mean_terminated_length": 4736.1748046875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.965274304151535, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002758471528068185, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 44285327.0, + "reward": 0.328125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 
0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999663233757019, + "sampling/importance_sampling_ratio/min": 0.010958661325275898, + "sampling/sampling_logp_difference/max": 4.513625144958496, + "sampling/sampling_logp_difference/mean": 0.019083233550190926, + "step": 60 + }, + { + "clip_ratio/high_max": 1.0621563887980301e-05, + "clip_ratio/high_mean": 2.6553909719950752e-06, + "clip_ratio/low_mean": 3.838553107016196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1040922042157035e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15031.0, + "completions/mean_length": 4998.2890625, + "completions/mean_terminated_length": 4908.6376953125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9200445115566254, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027611786499619484, + "learning_rate": 1e-05, + "loss": 0.0575, + "num_tokens": 44944356.0, + "reward": 0.3515625, + "reward_std": 0.3895368278026581, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884366989136, + "sampling/importance_sampling_ratio/min": 0.0018651526188477874, + "sampling/sampling_logp_difference/max": 6.284412384033203, + "sampling/sampling_logp_difference/mean": 0.017853498458862305, + "step": 61 + }, + { + "clip_ratio/high_max": 1.0136624496226432e-05, + "clip_ratio/high_mean": 2.534156124056608e-06, + "clip_ratio/low_mean": 2.0260404085092887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2794560095462657e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6290.1796875, + 
"completions/mean_terminated_length": 6129.96044921875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.9360214695334435, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015557854203507304, + "learning_rate": 1e-05, + "loss": 0.0111, + "num_tokens": 45767867.0, + "reward": 0.34375, + "reward_std": 0.30168038606643677, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999427795410156, + "sampling/importance_sampling_ratio/min": 0.0011004531988874078, + "sampling/sampling_logp_difference/max": 6.812033176422119, + "sampling/sampling_logp_difference/mean": 0.0200855303555727, + "step": 62 + }, + { + "clip_ratio/high_max": 2.2559511307918e-06, + "clip_ratio/high_mean": 5.6398778269795e-07, + "clip_ratio/low_mean": 4.51761221711422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.574010984015331e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16366.0, + "completions/mean_length": 6486.15625, + "completions/mean_terminated_length": 6248.6083984375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.863138921558857, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026953541673719883, + "learning_rate": 1e-05, + "loss": -0.0194, + "num_tokens": 46618575.0, + "reward": 0.2578125, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999406337738037, + "sampling/importance_sampling_ratio/min": 0.0011708897072821856, + "sampling/sampling_logp_difference/max": 6.749991416931152, + 
"sampling/sampling_logp_difference/mean": 0.01863238587975502, + "step": 63 + }, + { + "clip_ratio/high_max": 1.0073357771034352e-05, + "clip_ratio/high_mean": 2.518339442758588e-06, + "clip_ratio/low_mean": 2.787370635815023e-05, + "clip_ratio/low_min": 3.837534222839167e-06, + "clip_ratio/region_mean": 3.0392045573535142e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 6442.7734375, + "completions/mean_terminated_length": 6284.9765625, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0242054909467697, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024442619178444147, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 47462274.0, + "reward": 0.328125, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998892545700073, + "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09, + "sampling/sampling_logp_difference/max": 19.124980926513672, + "sampling/sampling_logp_difference/mean": 0.019810764119029045, + "step": 64 + }, + { + "clip_ratio/high_max": 1.220810372615233e-05, + "clip_ratio/high_mean": 3.0520259315380827e-06, + "clip_ratio/low_mean": 4.339240456374682e-05, + "clip_ratio/low_min": 4.491233084991109e-06, + "clip_ratio/region_mean": 4.644443038159807e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 4807.765625, + "completions/mean_terminated_length": 4716.6142578125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 1.045751042664051, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.002512057079002261, + "learning_rate": 1e-05, + "loss": 0.003, + "num_tokens": 48096692.0, + "reward": 0.3671875, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999058842658997, + "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05, + "sampling/sampling_logp_difference/max": 11.374892234802246, + "sampling/sampling_logp_difference/mean": 0.01960371434688568, + "step": 65 + }, + { + "clip_ratio/high_max": 5.37941218681226e-06, + "clip_ratio/high_mean": 1.344853046703065e-06, + "clip_ratio/low_mean": 3.0161771633174794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1506624850408116e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 6703.8359375, + "completions/mean_terminated_length": 6471.51220703125, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 1.0592866837978363, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016389708034694195, + "learning_rate": 1e-05, + "loss": -0.024, + "num_tokens": 48974399.0, + "reward": 0.2734375, + "reward_std": 0.2585548758506775, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999353885650635, + "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06, + "sampling/sampling_logp_difference/max": 11.8125, + "sampling/sampling_logp_difference/mean": 0.020880095660686493, + "step": 66 + }, + { + "clip_ratio/high_max": 7.093600515872822e-06, + "clip_ratio/high_mean": 1.7734001289682055e-06, + "clip_ratio/low_mean": 4.470584758564655e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.647924811251869e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16295.0, + "completions/mean_length": 6140.5078125, + "completions/mean_terminated_length": 5724.10546875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 1.0998501181602478, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003946912474930286, + "learning_rate": 1e-05, + "loss": 0.0448, + "num_tokens": 49779920.0, + "reward": 0.34375, + "reward_std": 0.36796674132347107, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 2.849436668839189e-07, + "sampling/sampling_logp_difference/max": 15.070974349975586, + "sampling/sampling_logp_difference/mean": 0.021355850622057915, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.313956779038563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.313956779038563e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 6689.8046875, + "completions/mean_terminated_length": 6213.04052734375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.8561654165387154, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021656695753335953, + "learning_rate": 1e-05, + "loss": 0.0283, + "num_tokens": 50655023.0, + "reward": 0.203125, + "reward_std": 0.21723884344100952, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999941885471344, + "sampling/importance_sampling_ratio/min": 2.836359499269747e-06, + "sampling/sampling_logp_difference/max": 12.772989273071289, + "sampling/sampling_logp_difference/mean": 0.01873670145869255, + "step": 68 + }, + { + "clip_ratio/high_max": 2.3421607693308033e-05, + "clip_ratio/high_mean": 7.242933975248889e-06, + "clip_ratio/low_mean": 3.896083626386826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.620377103492501e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14330.0, + "completions/max_terminated_length": 14330.0, + "completions/mean_length": 5707.0078125, + "completions/mean_terminated_length": 5707.0078125, + "completions/min_length": 625.0, + "completions/min_terminated_length": 625.0, + "entropy": 1.1396166533231735, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004121148493140936, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 51406536.0, + "reward": 0.3125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999328851699829, + "sampling/importance_sampling_ratio/min": 0.0005196487763896585, + "sampling/sampling_logp_difference/max": 7.562357425689697, + "sampling/sampling_logp_difference/mean": 0.020000409334897995, + "step": 69 + }, + { + "clip_ratio/high_max": 1.82290532393381e-05, + "clip_ratio/high_mean": 4.557263309834525e-06, + "clip_ratio/low_mean": 2.5275351731579576e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9832615496161452e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5655.6328125, + "completions/mean_terminated_length": 5571.1572265625, + "completions/min_length": 157.0, + 
"completions/min_terminated_length": 157.0, + "entropy": 0.8928132206201553, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032538517843931913, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 52148473.0, + "reward": 0.3984375, + "reward_std": 0.29432642459869385, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000033378601074, + "sampling/importance_sampling_ratio/min": 0.0017573959194123745, + "sampling/sampling_logp_difference/max": 6.343922138214111, + "sampling/sampling_logp_difference/mean": 0.018881790339946747, + "step": 70 + }, + { + "clip_ratio/high_max": 1.2836022506235167e-05, + "clip_ratio/high_mean": 3.209005626558792e-06, + "clip_ratio/low_mean": 3.8109637216621195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.131864307055366e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7399.7890625, + "completions/mean_terminated_length": 7034.5771484375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.8808257132768631, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002061733277514577, + "learning_rate": 1e-05, + "loss": 0.0191, + "num_tokens": 53113230.0, + "reward": 0.3046875, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673962593079, + "sampling/importance_sampling_ratio/min": 0.005283349193632603, + "sampling/sampling_logp_difference/max": 5.243195056915283, + "sampling/sampling_logp_difference/mean": 0.018456293269991875, + "step": 71 + }, + { + "clip_ratio/high_max": 
1.5806871488166507e-05, + "clip_ratio/high_mean": 4.739466817227367e-06, + "clip_ratio/low_mean": 3.610486896832299e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.084433521711617e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16208.0, + "completions/mean_length": 5730.9609375, + "completions/mean_terminated_length": 5475.2880859375, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9486126750707626, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012298432411625981, + "learning_rate": 1e-05, + "loss": 0.0208, + "num_tokens": 53864049.0, + "reward": 0.359375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999348521232605, + "sampling/importance_sampling_ratio/min": 4.832820559386164e-05, + "sampling/sampling_logp_difference/max": 9.937495231628418, + "sampling/sampling_logp_difference/mean": 0.01919996738433838, + "step": 72 + }, + { + "clip_ratio/high_max": 1.2390134997986024e-05, + "clip_ratio/high_mean": 3.097533749496506e-06, + "clip_ratio/low_mean": 3.8867822581778455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.19653564449618e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13500.0, + "completions/mean_length": 4620.5703125, + "completions/mean_terminated_length": 4527.94482421875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9557560831308365, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002882040338590741, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 54473498.0, + "reward": 0.3984375, + "reward_std": 
0.39294686913490295, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915195465088, + "sampling/importance_sampling_ratio/min": 1.577107298089686e-07, + "sampling/sampling_logp_difference/max": 15.662503242492676, + "sampling/sampling_logp_difference/mean": 0.018525000661611557, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.088819471486204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.088819471486204e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16314.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 5074.0703125, + "completions/mean_terminated_length": 5074.0703125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8830869868397713, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003324020653963089, + "learning_rate": 1e-05, + "loss": 0.0305, + "num_tokens": 55141787.0, + "reward": 0.4609375, + "reward_std": 0.30115634202957153, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999203681945801, + "sampling/importance_sampling_ratio/min": 0.0009876838885247707, + "sampling/sampling_logp_difference/max": 6.920147895812988, + "sampling/sampling_logp_difference/mean": 0.018072880804538727, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.526649884908693e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.526649884908693e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15251.0, + "completions/max_terminated_length": 15251.0, + "completions/mean_length": 6192.1015625, + 
"completions/mean_terminated_length": 6192.1015625, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "entropy": 1.0888547226786613, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017452294705435634, + "learning_rate": 1e-05, + "loss": 0.0216, + "num_tokens": 55954144.0, + "reward": 0.2890625, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473690986633, + "sampling/importance_sampling_ratio/min": 5.061922365712235e-07, + "sampling/sampling_logp_difference/max": 14.496349334716797, + "sampling/sampling_logp_difference/mean": 0.021221645176410675, + "step": 75 + }, + { + "clip_ratio/high_max": 1.6768677141953958e-05, + "clip_ratio/high_mean": 5.080836899651331e-06, + "clip_ratio/low_mean": 3.340929970363504e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.84901372854074e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15740.0, + "completions/mean_length": 6204.296875, + "completions/mean_terminated_length": 6124.1416015625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 1.0423575639724731, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0033357341308146715, + "learning_rate": 1e-05, + "loss": 0.1073, + "num_tokens": 56765470.0, + "reward": 0.3359375, + "reward_std": 0.37875816226005554, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99998539686203, + "sampling/importance_sampling_ratio/min": 4.564182381727733e-05, + "sampling/sampling_logp_difference/max": 9.994686126708984, + 
"sampling/sampling_logp_difference/mean": 0.01908688060939312, + "step": 76 + }, + { + "clip_ratio/high_max": 3.149884150843718e-06, + "clip_ratio/high_mean": 7.874710377109295e-07, + "clip_ratio/low_mean": 2.430614893000893e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.509361991087644e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14409.0, + "completions/max_terminated_length": 14409.0, + "completions/mean_length": 5070.3125, + "completions/mean_terminated_length": 5070.3125, + "completions/min_length": 629.0, + "completions/min_terminated_length": 629.0, + "entropy": 1.0737399458885193, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038695367984473705, + "learning_rate": 1e-05, + "loss": 0.0015, + "num_tokens": 57432958.0, + "reward": 0.390625, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999223947525024, + "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06, + "sampling/sampling_logp_difference/max": 13.376652717590332, + "sampling/sampling_logp_difference/mean": 0.01970684342086315, + "step": 77 + }, + { + "clip_ratio/high_max": 1.9821940441033803e-05, + "clip_ratio/high_mean": 4.955485110258451e-06, + "clip_ratio/low_mean": 2.9055729555693688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.401121466595214e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15799.0, + "completions/mean_length": 5750.21875, + "completions/mean_terminated_length": 5495.00830078125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.9708107560873032, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002927646040916443, + "learning_rate": 1e-05, + 
"loss": 0.0166, + "num_tokens": 58187426.0, + "reward": 0.296875, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999390840530396, + "sampling/importance_sampling_ratio/min": 0.015204614959657192, + "sampling/sampling_logp_difference/max": 4.186156272888184, + "sampling/sampling_logp_difference/mean": 0.019483914598822594, + "step": 78 + }, + { + "clip_ratio/high_max": 2.3815636723156786e-05, + "clip_ratio/high_mean": 5.953909180789196e-06, + "clip_ratio/low_mean": 4.989707144886779e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.585097960647545e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15938.0, + "completions/mean_length": 6067.484375, + "completions/mean_terminated_length": 5986.251953125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9576351121068001, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0026169484481215477, + "learning_rate": 1e-05, + "loss": -0.0055, + "num_tokens": 58983336.0, + "reward": 0.390625, + "reward_std": 0.3406373858451843, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620914459229, + "sampling/importance_sampling_ratio/min": 1.974713995878119e-06, + "sampling/sampling_logp_difference/max": 13.135087013244629, + "sampling/sampling_logp_difference/mean": 0.019007554277777672, + "step": 79 + }, + { + "clip_ratio/high_max": 2.4238934656750644e-05, + "clip_ratio/high_mean": 7.786730066072778e-06, + "clip_ratio/low_mean": 4.5700241571466904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3486972547034384e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 13640.0, + "completions/max_terminated_length": 13640.0, + "completions/mean_length": 4612.8984375, + "completions/mean_terminated_length": 4612.8984375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.9636320173740387, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015429699560627341, + "learning_rate": 1e-05, + "loss": -0.018, + "num_tokens": 59590763.0, + "reward": 0.421875, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08, + "sampling/sampling_logp_difference/max": 17.468652725219727, + "sampling/sampling_logp_difference/mean": 0.019313856959342957, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0911465842109465e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0911465842109465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 6101.3125, + "completions/mean_terminated_length": 5854.5283203125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.8831139355897903, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022505265660583973, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 60391283.0, + "reward": 0.3125, + "reward_std": 0.29302334785461426, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + 
"sampling/importance_sampling_ratio/min": 0.0003816343960352242, + "sampling/sampling_logp_difference/max": 7.871047496795654, + "sampling/sampling_logp_difference/mean": 0.018377842381596565, + "step": 81 + }, + { + "clip_ratio/high_max": 1.547606643725885e-05, + "clip_ratio/high_mean": 3.869016609314713e-06, + "clip_ratio/low_mean": 2.478705800967873e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8656074391619768e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14862.0, + "completions/mean_length": 4705.9921875, + "completions/mean_terminated_length": 4614.03955078125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.9557913094758987, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002069958718493581, + "learning_rate": 1e-05, + "loss": -0.0015, + "num_tokens": 61021490.0, + "reward": 0.4296875, + "reward_std": 0.2637920379638672, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030232429504, + "sampling/importance_sampling_ratio/min": 2.76673017651774e-05, + "sampling/sampling_logp_difference/max": 10.495259284973145, + "sampling/sampling_logp_difference/mean": 0.018629569560289383, + "step": 82 + }, + { + "clip_ratio/high_max": 2.0910484636260662e-05, + "clip_ratio/high_mean": 5.2276211590651656e-06, + "clip_ratio/low_mean": 1.952954164607945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4757162805144617e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13745.0, + "completions/max_terminated_length": 13745.0, + "completions/mean_length": 5116.78125, + "completions/mean_terminated_length": 5116.78125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 1.0198405236005783, 
+ "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034461067989468575, + "learning_rate": 1e-05, + "loss": -0.0073, + "num_tokens": 61695382.0, + "reward": 0.265625, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999936819076538, + "sampling/importance_sampling_ratio/min": 0.012227212078869343, + "sampling/sampling_logp_difference/max": 4.4040913581848145, + "sampling/sampling_logp_difference/mean": 0.019400250166654587, + "step": 83 + }, + { + "clip_ratio/high_max": 1.5340228401328204e-05, + "clip_ratio/high_mean": 3.835057100332051e-06, + "clip_ratio/low_mean": 3.150914017169271e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.534419727202476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 5891.9140625, + "completions/mean_terminated_length": 5553.45947265625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.9568078517913818, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025854657869786024, + "learning_rate": 1e-05, + "loss": 0.1013, + "num_tokens": 62474883.0, + "reward": 0.3203125, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001013278961182, + "sampling/importance_sampling_ratio/min": 0.0015072470996528864, + "sampling/sampling_logp_difference/max": 6.497470378875732, + "sampling/sampling_logp_difference/mean": 0.019574139267206192, + "step": 84 + }, + { + "clip_ratio/high_max": 1.108303422370227e-05, + "clip_ratio/high_mean": 2.7707585559255676e-06, + 
"clip_ratio/low_mean": 2.2325777763398946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5096536319324514e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13671.0, + "completions/mean_length": 5300.3359375, + "completions/mean_terminated_length": 5213.06298828125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.9722280204296112, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025075653102248907, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 63172454.0, + "reward": 0.203125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 0.00020346972451079637, + "sampling/sampling_logp_difference/max": 8.499993324279785, + "sampling/sampling_logp_difference/mean": 0.02002432942390442, + "step": 85 + }, + { + "clip_ratio/high_max": 1.3991947980684927e-05, + "clip_ratio/high_mean": 3.4979869951712317e-06, + "clip_ratio/low_mean": 4.893367201930232e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.243165958290774e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15617.0, + "completions/mean_length": 6364.21875, + "completions/mean_terminated_length": 6205.1748046875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 1.0607495978474617, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017982006538659334, + "learning_rate": 1e-05, + "loss": -0.0117, + "num_tokens": 64007602.0, + "reward": 0.2890625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.2890625, + 
"rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 3.823801307589747e-05, + "sampling/sampling_logp_difference/max": 10.171680450439453, + "sampling/sampling_logp_difference/mean": 0.020373597741127014, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6416430046083406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6416430046083406e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14709.0, + "completions/mean_length": 5746.3125, + "completions/mean_terminated_length": 5403.1611328125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "entropy": 0.9913106113672256, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002207317156717181, + "learning_rate": 1e-05, + "loss": 0.063, + "num_tokens": 64762058.0, + "reward": 0.34375, + "reward_std": 0.3264310359954834, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999239444732666, + "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08, + "sampling/sampling_logp_difference/max": 16.744617462158203, + "sampling/sampling_logp_difference/mean": 0.020608089864253998, + "step": 87 + }, + { + "clip_ratio/high_max": 1.2681661701208213e-05, + "clip_ratio/high_mean": 3.1704154253020533e-06, + "clip_ratio/low_mean": 3.541917828897567e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.85895939416514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6088.5625, + 
"completions/mean_terminated_length": 5841.47216796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.9040444120764732, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012974507408216596, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 65561002.0, + "reward": 0.3671875, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998487234115601, + "sampling/importance_sampling_ratio/min": 6.021501121722395e-06, + "sampling/sampling_logp_difference/max": 12.020174026489258, + "sampling/sampling_logp_difference/mean": 0.01939838007092476, + "step": 88 + }, + { + "clip_ratio/high_max": 7.807132533343975e-06, + "clip_ratio/high_mean": 1.9517831333359936e-06, + "clip_ratio/low_mean": 1.8564539345788944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.05163223654381e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15021.0, + "completions/mean_length": 5765.5, + "completions/mean_terminated_length": 5510.65625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.9966336265206337, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0013380619930103421, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 66318482.0, + "reward": 0.375, + "reward_std": 0.13994136452674866, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999471306800842, + "sampling/importance_sampling_ratio/min": 7.288413598871557e-06, + "sampling/sampling_logp_difference/max": 11.829224586486816, + 
"sampling/sampling_logp_difference/mean": 0.018109245225787163, + "step": 89 + }, + { + "clip_ratio/high_max": 1.7906912489706883e-05, + "clip_ratio/high_mean": 4.476728122426721e-06, + "clip_ratio/low_mean": 2.5812531305291486e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0289259655091882e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16120.0, + "completions/mean_length": 5462.78125, + "completions/mean_terminated_length": 5200.67236328125, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "entropy": 0.9345141425728798, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023930128663778305, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 67038582.0, + "reward": 0.46875, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513030052185, + "sampling/importance_sampling_ratio/min": 0.008508839644491673, + "sampling/sampling_logp_difference/max": 4.7666497230529785, + "sampling/sampling_logp_difference/mean": 0.019220296293497086, + "step": 90 + }, + { + "clip_ratio/high_max": 1.551389118503721e-05, + "clip_ratio/high_mean": 3.878472796259302e-06, + "clip_ratio/low_mean": 3.239646628117043e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6274939645863924e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15034.0, + "completions/max_terminated_length": 15034.0, + "completions/mean_length": 5547.5078125, + "completions/mean_terminated_length": 5547.5078125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0511749312281609, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0013633714988827705, + "learning_rate": 1e-05, 
+ "loss": 0.0462, + "num_tokens": 67774487.0, + "reward": 0.203125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05, + "sampling/sampling_logp_difference/max": 11.418023109436035, + "sampling/sampling_logp_difference/mean": 0.020328814163804054, + "step": 91 + }, + { + "clip_ratio/high_max": 1.5384989410449634e-05, + "clip_ratio/high_mean": 3.846247352612409e-06, + "clip_ratio/low_mean": 3.441604167164769e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.826228908110352e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5835.4140625, + "completions/mean_terminated_length": 5406.609375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 1.0024723336100578, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0036165034398436546, + "learning_rate": 1e-05, + "loss": 0.0373, + "num_tokens": 68541660.0, + "reward": 0.34375, + "reward_std": 0.3584783673286438, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 9.518130354990717e-06, + "sampling/sampling_logp_difference/max": 11.562312126159668, + "sampling/sampling_logp_difference/mean": 0.020469525828957558, + "step": 92 + }, + { + "clip_ratio/high_max": 6.105602551542688e-06, + "clip_ratio/high_mean": 1.526400637885672e-06, + "clip_ratio/low_mean": 5.3129634352444555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.46560352177039e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15695.0, + "completions/mean_length": 6252.609375, + "completions/mean_terminated_length": 6172.83447265625, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "entropy": 1.0325519517064095, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022011541295796633, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 69365418.0, + "reward": 0.3828125, + "reward_std": 0.32301604747772217, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998809099197388, + "sampling/importance_sampling_ratio/min": 0.0005531083443202078, + "sampling/sampling_logp_difference/max": 7.4999566078186035, + "sampling/sampling_logp_difference/mean": 0.02079072594642639, + "step": 93 + }, + { + "clip_ratio/high_max": 4.348128641140647e-06, + "clip_ratio/high_mean": 1.0870321602851618e-06, + "clip_ratio/low_mean": 3.0097819148977578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.118485085451539e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15316.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 5581.484375, + "completions/mean_terminated_length": 5581.484375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.9222500994801521, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002300912281498313, + "learning_rate": 1e-05, + "loss": -0.0007, + "num_tokens": 70099320.0, + "reward": 0.296875, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9998577833175659, + "sampling/importance_sampling_ratio/min": 8.140386853483506e-08, + "sampling/sampling_logp_difference/max": 16.323843002319336, + "sampling/sampling_logp_difference/mean": 0.01952272653579712, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5122252029395895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5122252029395895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15781.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5424.140625, + "completions/mean_terminated_length": 5424.140625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 1.0446564108133316, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016312639927491546, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 70811474.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000094175338745, + "sampling/importance_sampling_ratio/min": 0.0021919538266956806, + "sampling/sampling_logp_difference/max": 6.12296199798584, + "sampling/sampling_logp_difference/mean": 0.019741754978895187, + "step": 95 + }, + { + "clip_ratio/high_max": 1.0354576261306647e-05, + "clip_ratio/high_mean": 3.496124691082514e-06, + "clip_ratio/low_mean": 4.096481598026003e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.446094089871622e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 5884.9609375, + "completions/mean_terminated_length": 5884.9609375, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9605691060423851, + "epoch": 
0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032865386456251144, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 71582701.0, + "reward": 0.4140625, + "reward_std": 0.3514111638069153, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999833106994629, + "sampling/importance_sampling_ratio/min": 1.149311810877407e-05, + "sampling/sampling_logp_difference/max": 11.373762130737305, + "sampling/sampling_logp_difference/mean": 0.019438734278082848, + "step": 96 + }, + { + "clip_ratio/high_max": 1.026998006636859e-05, + "clip_ratio/high_mean": 2.5674950165921473e-06, + "clip_ratio/low_mean": 3.5440503552308655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8007998455213965e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15361.0, + "completions/max_terminated_length": 15361.0, + "completions/mean_length": 4835.09375, + "completions/mean_terminated_length": 4835.09375, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.9038172215223312, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004721678793430328, + "learning_rate": 1e-05, + "loss": 0.1143, + "num_tokens": 72220025.0, + "reward": 0.4765625, + "reward_std": 0.38481879234313965, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99994957447052, + "sampling/importance_sampling_ratio/min": 2.710051205667696e-07, + "sampling/sampling_logp_difference/max": 15.12112808227539, + "sampling/sampling_logp_difference/mean": 0.017888439819216728, + "step": 97 + }, + { + "clip_ratio/high_max": 2.93432283342554e-05, + "clip_ratio/high_mean": 9.56252398509605e-06, + "clip_ratio/low_mean": 
4.7865792453194445e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.742831808674964e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14431.0, + "completions/mean_length": 5979.078125, + "completions/mean_terminated_length": 5897.1494140625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 1.0227951630949974, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0010532280430197716, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 73005515.0, + "reward": 0.2890625, + "reward_std": 0.30115631222724915, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999090433120728, + "sampling/importance_sampling_ratio/min": 0.00030157779110595584, + "sampling/sampling_logp_difference/max": 8.10648250579834, + "sampling/sampling_logp_difference/mean": 0.019633149728178978, + "step": 98 + }, + { + "clip_ratio/high_max": 4.203234766464448e-06, + "clip_ratio/high_mean": 1.050808691616112e-06, + "clip_ratio/low_mean": 2.5574990331733716e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6625799137036665e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15886.0, + "completions/max_terminated_length": 15886.0, + "completions/mean_length": 4292.1796875, + "completions/mean_terminated_length": 4292.1796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.8719984591007233, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038324075285345316, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 73572794.0, + "reward": 0.4375, + "reward_std": 0.2972046136856079, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999188780784607, + "sampling/importance_sampling_ratio/min": 0.015675775706768036, + "sampling/sampling_logp_difference/max": 4.155638694763184, + "sampling/sampling_logp_difference/mean": 0.018074234947562218, + "step": 99 + }, + { + "clip_ratio/high_max": 4.431366960488958e-06, + "clip_ratio/high_mean": 1.1078417401222396e-06, + "clip_ratio/low_mean": 4.433405501913512e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.54418968729442e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14674.0, + "completions/max_terminated_length": 14674.0, + "completions/mean_length": 5449.2890625, + "completions/mean_terminated_length": 5449.2890625, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9137986451387405, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004843447357416153, + "learning_rate": 1e-05, + "loss": 0.0166, + "num_tokens": 74289607.0, + "reward": 0.5, + "reward_std": 0.40609243512153625, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 8.851584993863071e-07, + "sampling/sampling_logp_difference/max": 13.937499046325684, + "sampling/sampling_logp_difference/mean": 0.018183842301368713, + "step": 100 + }, + { + "clip_ratio/high_max": 8.212076863856055e-06, + "clip_ratio/high_mean": 2.0530192159640137e-06, + "clip_ratio/low_mean": 3.6279372466196946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.833239122741361e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16163.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 4983.3515625, + "completions/mean_terminated_length": 4983.3515625, + 
"completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "entropy": 0.9354705810546875, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037651765160262585, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 74946484.0, + "reward": 0.3671875, + "reward_std": 0.3090519309043884, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 0.00011593531962716952, + "sampling/sampling_logp_difference/max": 9.062478065490723, + "sampling/sampling_logp_difference/mean": 0.018207306042313576, + "step": 101 + }, + { + "clip_ratio/high_max": 1.3182888324081432e-05, + "clip_ratio/high_mean": 3.295722081020358e-06, + "clip_ratio/low_mean": 2.544108633628639e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8736808644680423e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16039.0, + "completions/mean_length": 6351.1015625, + "completions/mean_terminated_length": 6027.45947265625, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "entropy": 0.9310042560100555, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0009160125628113747, + "learning_rate": 1e-05, + "loss": -0.023, + "num_tokens": 75779145.0, + "reward": 0.3828125, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998877048492432, + "sampling/importance_sampling_ratio/min": 0.0002961359277833253, + "sampling/sampling_logp_difference/max": 8.1246919631958, + "sampling/sampling_logp_difference/mean": 0.018513178452849388, + "step": 102 
+ }, + { + "clip_ratio/high_max": 1.1402620202716207e-05, + "clip_ratio/high_mean": 3.935649147024378e-06, + "clip_ratio/low_mean": 3.059757568735222e-05, + "clip_ratio/low_min": 4.3258582991256844e-06, + "clip_ratio/region_mean": 3.45332257438713e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14471.0, + "completions/mean_length": 5293.40625, + "completions/mean_terminated_length": 4935.64501953125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 1.0732879787683487, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023993055801838636, + "learning_rate": 1e-05, + "loss": 0.1021, + "num_tokens": 76475557.0, + "reward": 0.34375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000077724456787, + "sampling/importance_sampling_ratio/min": 6.613240111619234e-05, + "sampling/sampling_logp_difference/max": 9.623851776123047, + "sampling/sampling_logp_difference/mean": 0.020792219787836075, + "step": 103 + }, + { + "clip_ratio/high_max": 2.130644793396641e-05, + "clip_ratio/high_mean": 8.929533635182452e-06, + "clip_ratio/low_mean": 2.663600798769039e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.556554071337814e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 7619.7578125, + "completions/mean_terminated_length": 7409.41650390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9646238535642624, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014872358879074454, + "learning_rate": 1e-05, + "loss": 0.0439, + "num_tokens": 77474310.0, + 
"reward": 0.34375, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999638795852661, + "sampling/importance_sampling_ratio/min": 0.0016686831368133426, + "sampling/sampling_logp_difference/max": 6.395720481872559, + "sampling/sampling_logp_difference/mean": 0.020074717700481415, + "step": 104 + }, + { + "clip_ratio/high_max": 1.7765815300663235e-05, + "clip_ratio/high_mean": 5.154013138053415e-06, + "clip_ratio/low_mean": 5.166909659237717e-05, + "clip_ratio/low_min": 8.365680514543783e-06, + "clip_ratio/region_mean": 5.68231100714911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15984.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 5959.921875, + "completions/mean_terminated_length": 5959.921875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.004471093416214, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00398358516395092, + "learning_rate": 1e-05, + "loss": 0.1016, + "num_tokens": 78257132.0, + "reward": 0.359375, + "reward_std": 0.3653082847595215, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000170469284058, + "sampling/importance_sampling_ratio/min": 0.0030075267422944307, + "sampling/sampling_logp_difference/max": 5.806637287139893, + "sampling/sampling_logp_difference/mean": 0.020755283534526825, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6946955838648137e-05, + "clip_ratio/high_mean": 4.236738959662034e-06, + "clip_ratio/low_mean": 4.510891039899434e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.934564867653535e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 13736.0, + "completions/mean_length": 5427.03125, + "completions/mean_terminated_length": 5340.755859375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9117375314235687, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019883522763848305, + "learning_rate": 1e-05, + "loss": 0.01, + "num_tokens": 78971072.0, + "reward": 0.375, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000550746917725, + "sampling/importance_sampling_ratio/min": 0.0008046010043472052, + "sampling/sampling_logp_difference/max": 7.125164031982422, + "sampling/sampling_logp_difference/mean": 0.018812140449881554, + "step": 106 + }, + { + "clip_ratio/high_max": 2.968176841022796e-05, + "clip_ratio/high_mean": 7.42044210255699e-06, + "clip_ratio/low_mean": 3.220799408154562e-05, + "clip_ratio/low_min": 5.315981979947537e-06, + "clip_ratio/region_mean": 3.962843629778945e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16293.0, + "completions/max_terminated_length": 16293.0, + "completions/mean_length": 6062.078125, + "completions/mean_terminated_length": 6062.078125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 1.0164100378751755, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00450351694598794, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 79764434.0, + "reward": 0.2578125, + "reward_std": 0.26355957984924316, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999713897705078, + 
"sampling/importance_sampling_ratio/min": 0.0007411236292682588, + "sampling/sampling_logp_difference/max": 7.207343101501465, + "sampling/sampling_logp_difference/mean": 0.020526543259620667, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.856050622947805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.856050622947805e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13689.0, + "completions/max_terminated_length": 13689.0, + "completions/mean_length": 4856.53125, + "completions/mean_terminated_length": 4856.53125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 1.0780886858701706, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0033157530706375837, + "learning_rate": 1e-05, + "loss": 0.046, + "num_tokens": 80405238.0, + "reward": 0.3359375, + "reward_std": 0.3487703502178192, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999889135360718, + "sampling/importance_sampling_ratio/min": 0.033773623406887054, + "sampling/sampling_logp_difference/max": 3.7256407737731934, + "sampling/sampling_logp_difference/mean": 0.019188418984413147, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.975351790406421e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.975351790406421e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16335.0, + "completions/max_terminated_length": 16335.0, + "completions/mean_length": 3930.5859375, + "completions/mean_terminated_length": 3930.5859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8666863515973091, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.005471619311720133, + "learning_rate": 1e-05, + "loss": -0.0779, + "num_tokens": 80926721.0, + "reward": 0.5859375, + "reward_std": 0.3164186179637909, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000040531158447, + "sampling/importance_sampling_ratio/min": 0.0002562212466727942, + "sampling/sampling_logp_difference/max": 8.269469261169434, + "sampling/sampling_logp_difference/mean": 0.017708823084831238, + "step": 109 + }, + { + "clip_ratio/high_max": 6.743997801095247e-06, + "clip_ratio/high_mean": 1.6859994502738118e-06, + "clip_ratio/low_mean": 3.61007656692891e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7786765119562915e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15546.0, + "completions/mean_length": 5934.9453125, + "completions/mean_terminated_length": 5684.16845703125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.9991667941212654, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002580739092081785, + "learning_rate": 1e-05, + "loss": -0.0065, + "num_tokens": 81707978.0, + "reward": 0.3046875, + "reward_std": 0.24671243131160736, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000852346420288, + "sampling/importance_sampling_ratio/min": 0.002478762762621045, + "sampling/sampling_logp_difference/max": 5.999995708465576, + "sampling/sampling_logp_difference/mean": 0.019801246002316475, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.43532002741631e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.43532002741631e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 5866.84375, + "completions/mean_terminated_length": 5699.9052734375, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.9848997294902802, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0010949905263260007, + "learning_rate": 1e-05, + "loss": 0.0266, + "num_tokens": 82477310.0, + "reward": 0.2734375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999667406082153, + "sampling/importance_sampling_ratio/min": 9.04304688447155e-05, + "sampling/sampling_logp_difference/max": 9.310929298400879, + "sampling/sampling_logp_difference/mean": 0.020769795402884483, + "step": 111 + }, + { + "clip_ratio/high_max": 1.9307613456476247e-05, + "clip_ratio/high_mean": 4.826903364119062e-06, + "clip_ratio/low_mean": 5.842190330440644e-05, + "clip_ratio/low_min": 1.2287753634154797e-05, + "clip_ratio/region_mean": 6.324880496322294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14501.0, + "completions/max_terminated_length": 14501.0, + "completions/mean_length": 6613.7578125, + "completions/mean_terminated_length": 6613.7578125, + "completions/min_length": 1033.0, + "completions/min_terminated_length": 1033.0, + "entropy": 0.9176012054085732, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020384234376251698, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 83345055.0, + "reward": 0.3671875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999457001686096, + "sampling/importance_sampling_ratio/min": 0.029541675001382828, + "sampling/sampling_logp_difference/max": 3.5219533443450928, + "sampling/sampling_logp_difference/mean": 0.018883168697357178, + "step": 112 + }, + { + "clip_ratio/high_max": 1.382043183184578e-05, + "clip_ratio/high_mean": 3.455107957961445e-06, + "clip_ratio/low_mean": 5.789885449303256e-05, + "clip_ratio/low_min": 1.017130716718384e-05, + "clip_ratio/region_mean": 6.135396188255982e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 6392.3125, + "completions/mean_terminated_length": 6070.0, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "entropy": 0.904954232275486, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0031166900880634785, + "learning_rate": 1e-05, + "loss": 0.0351, + "num_tokens": 84186343.0, + "reward": 0.390625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999208450317383, + "sampling/importance_sampling_ratio/min": 0.00022529886336997151, + "sampling/sampling_logp_difference/max": 8.398082733154297, + "sampling/sampling_logp_difference/mean": 0.01931958645582199, + "step": 113 + }, + { + "clip_ratio/high_max": 1.7221671441802755e-05, + "clip_ratio/high_mean": 6.549099907715572e-06, + "clip_ratio/low_mean": 3.147818074467068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802728065238625e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5982.703125, + "completions/mean_terminated_length": 5817.603515625, + "completions/min_length": 294.0, + 
"completions/min_terminated_length": 294.0, + "entropy": 0.8394555225968361, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022041688207536936, + "learning_rate": 1e-05, + "loss": 0.1043, + "num_tokens": 84971129.0, + "reward": 0.3125, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030828475952, + "sampling/importance_sampling_ratio/min": 1.553593506287143e-06, + "sampling/sampling_logp_difference/max": 13.374939918518066, + "sampling/sampling_logp_difference/mean": 0.01795877143740654, + "step": 114 + }, + { + "clip_ratio/high_max": 2.9651660042873118e-05, + "clip_ratio/high_mean": 9.398806923854863e-06, + "clip_ratio/low_mean": 4.788733849636628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.728614519284747e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14988.0, + "completions/mean_length": 4976.921875, + "completions/mean_terminated_length": 4608.95166015625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.8381234556436539, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0037972736172378063, + "learning_rate": 1e-05, + "loss": 0.1244, + "num_tokens": 85625559.0, + "reward": 0.4765625, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970555305481, + "sampling/importance_sampling_ratio/min": 0.002990707289427519, + "sampling/sampling_logp_difference/max": 5.8122453689575195, + "sampling/sampling_logp_difference/mean": 0.01815030723810196, + "step": 115 + }, + { + "clip_ratio/high_max": 
4.130592969886493e-06, + "clip_ratio/high_mean": 1.0326482424716232e-06, + "clip_ratio/low_mean": 1.6904315600640984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7936963843112608e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 6307.2421875, + "completions/mean_terminated_length": 6065.400390625, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 1.1176434755325317, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0012413962977007031, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 86453606.0, + "reward": 0.28125, + "reward_std": 0.2280253767967224, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 0.004730688873678446, + "sampling/sampling_logp_difference/max": 5.353684425354004, + "sampling/sampling_logp_difference/mean": 0.021790307015180588, + "step": 116 + }, + { + "clip_ratio/high_max": 1.3160772823539446e-05, + "clip_ratio/high_mean": 3.2901932058848615e-06, + "clip_ratio/low_mean": 3.582628983167524e-05, + "clip_ratio/low_min": 2.61966624748311e-06, + "clip_ratio/region_mean": 3.911648195753514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 7263.1640625, + "completions/mean_terminated_length": 7044.26416015625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.107876107096672, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017762042116373777, + "learning_rate": 1e-05, + "loss": 0.0349, + "num_tokens": 87402763.0, + "reward": 0.2578125, + "reward_std": 
0.27776598930358887, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741315841675, + "sampling/importance_sampling_ratio/min": 0.0009408573969267309, + "sampling/sampling_logp_difference/max": 6.968719005584717, + "sampling/sampling_logp_difference/mean": 0.02103034406900406, + "step": 117 + }, + { + "clip_ratio/high_max": 3.987745776612428e-05, + "clip_ratio/high_mean": 1.1877163728968299e-05, + "clip_ratio/low_mean": 4.26799579145154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.455712096136267e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15416.0, + "completions/mean_length": 5093.859375, + "completions/mean_terminated_length": 4914.65087890625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 1.1065888702869415, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032127038575708866, + "learning_rate": 1e-05, + "loss": 0.0194, + "num_tokens": 88077385.0, + "reward": 0.421875, + "reward_std": 0.345874547958374, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 7.033879228401929e-05, + "sampling/sampling_logp_difference/max": 9.562187194824219, + "sampling/sampling_logp_difference/mean": 0.020314980298280716, + "step": 118 + }, + { + "clip_ratio/high_max": 9.35208754526684e-06, + "clip_ratio/high_mean": 4.4788730519940145e-06, + "clip_ratio/low_mean": 3.470697703278347e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.918584917528278e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15740.0, + "completions/mean_length": 6943.53125, + "completions/mean_terminated_length": 6639.0, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.9009081721305847, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028925195802003145, + "learning_rate": 1e-05, + "loss": 0.0862, + "num_tokens": 88985269.0, + "reward": 0.3984375, + "reward_std": 0.3535328209400177, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 6.553035092338177e-08, + "sampling/sampling_logp_difference/max": 16.540752410888672, + "sampling/sampling_logp_difference/mean": 0.019378282129764557, + "step": 119 + }, + { + "clip_ratio/high_max": 1.0939961612166371e-05, + "clip_ratio/high_mean": 2.734990403041593e-06, + "clip_ratio/low_mean": 2.4615862798782473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7350853201824066e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15148.0, + "completions/max_terminated_length": 15148.0, + "completions/mean_length": 4976.25, + "completions/mean_terminated_length": 4976.25, + "completions/min_length": 702.0, + "completions/min_terminated_length": 702.0, + "entropy": 0.9463540017604828, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017386430408805609, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 89645205.0, + "reward": 0.359375, + "reward_std": 0.26462042331695557, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999554753303528, + "sampling/importance_sampling_ratio/min": 7.889595508459024e-06, + 
"sampling/sampling_logp_difference/max": 11.74996566772461, + "sampling/sampling_logp_difference/mean": 0.018035830929875374, + "step": 120 + }, + { + "clip_ratio/high_max": 5.941629297012696e-06, + "clip_ratio/high_mean": 1.485407324253174e-06, + "clip_ratio/low_mean": 2.6826061798601586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8311469009167922e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 6439.5390625, + "completions/mean_terminated_length": 6281.69091796875, + "completions/min_length": 959.0, + "completions/min_terminated_length": 959.0, + "entropy": 0.899876207113266, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037381781730800867, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 90489394.0, + "reward": 0.3203125, + "reward_std": 0.2624938488006592, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999206066131592, + "sampling/importance_sampling_ratio/min": 0.003606764366850257, + "sampling/sampling_logp_difference/max": 5.62494421005249, + "sampling/sampling_logp_difference/mean": 0.019368179142475128, + "step": 121 + }, + { + "clip_ratio/high_max": 5.189952389628161e-06, + "clip_ratio/high_mean": 1.2974880974070402e-06, + "clip_ratio/low_mean": 3.058137212974543e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.187886022715247e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15979.0, + "completions/mean_length": 6876.46875, + "completions/mean_terminated_length": 6408.884765625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.1018569767475128, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0018562980694696307, + "learning_rate": 1e-05, + "loss": 0.095, + "num_tokens": 91390054.0, + "reward": 0.21875, + "reward_std": 0.29955869913101196, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999849796295166, + "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05, + "sampling/sampling_logp_difference/max": 10.436432838439941, + "sampling/sampling_logp_difference/mean": 0.020825792104005814, + "step": 122 + }, + { + "clip_ratio/high_max": 2.022083435804234e-05, + "clip_ratio/high_mean": 5.055208589510585e-06, + "clip_ratio/low_mean": 3.029032552603894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.53455343429232e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14153.0, + "completions/mean_length": 6501.5078125, + "completions/mean_terminated_length": 6344.64306640625, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "entropy": 1.073579266667366, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016695430967956781, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 92241535.0, + "reward": 0.2734375, + "reward_std": 0.28641316294670105, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998984336853027, + "sampling/importance_sampling_ratio/min": 0.0002380236255703494, + "sampling/sampling_logp_difference/max": 8.343140602111816, + "sampling/sampling_logp_difference/mean": 0.020438479259610176, + "step": 123 + }, + { + "clip_ratio/high_max": 3.3911180707946187e-06, + "clip_ratio/high_mean": 8.477795176986547e-07, + "clip_ratio/low_mean": 2.2190370486896427e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.30381500614385e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14345.0, + "completions/max_terminated_length": 14345.0, + "completions/mean_length": 5474.1328125, + "completions/mean_terminated_length": 5474.1328125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0692576617002487, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034909825772047043, + "learning_rate": 1e-05, + "loss": 0.0, + "num_tokens": 92962472.0, + "reward": 0.3046875, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000006079673767, + "sampling/importance_sampling_ratio/min": 0.0017851731972768903, + "sampling/sampling_logp_difference/max": 6.328239917755127, + "sampling/sampling_logp_difference/mean": 0.019930578768253326, + "step": 124 + }, + { + "clip_ratio/high_max": 2.6292200345778838e-05, + "clip_ratio/high_mean": 7.620442374900449e-06, + "clip_ratio/low_mean": 4.615546390596137e-05, + "clip_ratio/low_min": 1.366510537081922e-05, + "clip_ratio/region_mean": 5.3775906508235494e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + "completions/mean_length": 7512.078125, + "completions/mean_terminated_length": 7225.88671875, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9676955863833427, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023449272848665714, + "learning_rate": 1e-05, + "loss": 0.0454, + "num_tokens": 93950506.0, + "reward": 0.3203125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999359250068665, + "sampling/importance_sampling_ratio/min": 0.0016406332142651081, + "sampling/sampling_logp_difference/max": 6.412672996520996, + "sampling/sampling_logp_difference/mean": 0.020141655579209328, + "step": 125 + }, + { + "clip_ratio/high_max": 5.097255780128762e-06, + "clip_ratio/high_mean": 1.2743139450321905e-06, + "clip_ratio/low_mean": 3.3802551342887455e-05, + "clip_ratio/low_min": 4.146762421441963e-06, + "clip_ratio/region_mean": 3.5076865287919645e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6920.484375, + "completions/mean_terminated_length": 6693.3603515625, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "entropy": 0.8662540689110756, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037103090435266495, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 94854016.0, + "reward": 0.4375, + "reward_std": 0.322716623544693, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00047686786274425685, + "sampling/sampling_logp_difference/max": 7.648271083831787, + "sampling/sampling_logp_difference/mean": 0.01915796287357807, + "step": 126 + }, + { + "clip_ratio/high_max": 8.4922439782531e-06, + "clip_ratio/high_mean": 2.123060994563275e-06, + "clip_ratio/low_mean": 5.024227584726759e-05, + "clip_ratio/low_min": 1.3627016414829995e-05, + "clip_ratio/region_mean": 5.236533706920454e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 7939.609375, + 
"completions/mean_terminated_length": 7805.57177734375, + "completions/min_length": 1260.0, + "completions/min_terminated_length": 1260.0, + "entropy": 0.9707008600234985, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024642283096909523, + "learning_rate": 1e-05, + "loss": 0.0788, + "num_tokens": 95889966.0, + "reward": 0.2265625, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998771548271179, + "sampling/importance_sampling_ratio/min": 4.540014560916461e-05, + "sampling/sampling_logp_difference/max": 9.999995231628418, + "sampling/sampling_logp_difference/mean": 0.020453302189707756, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.766829564710861e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.766829564710861e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14969.0, + "completions/mean_length": 5985.8203125, + "completions/mean_terminated_length": 5474.43408203125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.9083090648055077, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003317479742690921, + "learning_rate": 1e-05, + "loss": 0.0537, + "num_tokens": 96676847.0, + "reward": 0.3671875, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.000286750087980181, + "sampling/sampling_logp_difference/max": 8.156899452209473, + "sampling/sampling_logp_difference/mean": 
0.01996719278395176, + "step": 128 + }, + { + "clip_ratio/high_max": 1.8439853647578275e-05, + "clip_ratio/high_mean": 4.609963411894569e-06, + "clip_ratio/low_mean": 5.708034223061986e-05, + "clip_ratio/low_min": 2.75287948170444e-06, + "clip_ratio/region_mean": 6.169030598357494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15081.0, + "completions/mean_length": 6565.359375, + "completions/mean_terminated_length": 6488.04736328125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 1.1013468354940414, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019073591101914644, + "learning_rate": 1e-05, + "loss": 0.0622, + "num_tokens": 97539453.0, + "reward": 0.2734375, + "reward_std": 0.307217001914978, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999555945396423, + "sampling/importance_sampling_ratio/min": 0.0006022047018632293, + "sampling/sampling_logp_difference/max": 7.414913177490234, + "sampling/sampling_logp_difference/mean": 0.02150837704539299, + "step": 129 + }, + { + "clip_ratio/high_max": 9.068485269381199e-06, + "clip_ratio/high_mean": 2.2671213173452998e-06, + "clip_ratio/low_mean": 1.9822365402433206e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.208948649240483e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16099.0, + "completions/mean_length": 6779.6171875, + "completions/mean_terminated_length": 6703.9921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8940552547574043, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0010163087863475084, + "learning_rate": 1e-05, + "loss": 0.0249, + 
"num_tokens": 98429036.0, + "reward": 0.453125, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485015869141, + "sampling/importance_sampling_ratio/min": 3.464699460664633e-08, + "sampling/sampling_logp_difference/max": 17.178054809570312, + "sampling/sampling_logp_difference/mean": 0.018716152757406235, + "step": 130 + }, + { + "clip_ratio/high_max": 5.047242211730918e-06, + "clip_ratio/high_mean": 1.2618105529327295e-06, + "clip_ratio/low_mean": 2.9014110396019532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0275920835265424e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14549.0, + "completions/max_terminated_length": 14549.0, + "completions/mean_length": 5766.71875, + "completions/mean_terminated_length": 5766.71875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 1.0455922111868858, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002155766822397709, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 99184264.0, + "reward": 0.4140625, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999253749847412, + "sampling/importance_sampling_ratio/min": 0.00010798005678225309, + "sampling/sampling_logp_difference/max": 9.133563995361328, + "sampling/sampling_logp_difference/mean": 0.020948775112628937, + "step": 131 + }, + { + "clip_ratio/high_max": 2.0882574972347356e-05, + "clip_ratio/high_mean": 6.505383225885453e-06, + "clip_ratio/low_mean": 4.496008500609605e-05, + "clip_ratio/low_min": 7.757854064038838e-06, + "clip_ratio/region_mean": 5.1465468231981504e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14704.0, + "completions/mean_length": 6167.2421875, + "completions/mean_terminated_length": 6005.07177734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.9100174158811569, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0021464223973453045, + "learning_rate": 1e-05, + "loss": -0.0279, + "num_tokens": 99996831.0, + "reward": 0.421875, + "reward_std": 0.3916535973548889, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240040779114, + "sampling/importance_sampling_ratio/min": 0.02249590866267681, + "sampling/sampling_logp_difference/max": 3.794421911239624, + "sampling/sampling_logp_difference/mean": 0.01866895705461502, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0998018473837874e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0998018473837874e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15738.0, + "completions/mean_length": 6242.9453125, + "completions/mean_terminated_length": 6163.09423828125, + "completions/min_length": 1187.0, + "completions/min_terminated_length": 1187.0, + "entropy": 0.8624134212732315, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023277695290744305, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 100814112.0, + "reward": 0.3984375, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999959409236908, + 
"sampling/importance_sampling_ratio/min": 0.0002393616596236825, + "sampling/sampling_logp_difference/max": 8.33753490447998, + "sampling/sampling_logp_difference/mean": 0.0191188994795084, + "step": 133 + }, + { + "clip_ratio/high_max": 6.589872555196052e-06, + "clip_ratio/high_mean": 1.647468138799013e-06, + "clip_ratio/low_mean": 4.329304238126497e-05, + "clip_ratio/low_min": 3.5120251595799346e-06, + "clip_ratio/region_mean": 4.494051017900347e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14866.0, + "completions/mean_length": 5733.6875, + "completions/mean_terminated_length": 5478.080078125, + "completions/min_length": 789.0, + "completions/min_terminated_length": 789.0, + "entropy": 0.9628067463636398, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003547821193933487, + "learning_rate": 1e-05, + "loss": 0.0321, + "num_tokens": 101566264.0, + "reward": 0.3984375, + "reward_std": 0.36584997177124023, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0001282035664189607, + "sampling/sampling_logp_difference/max": 8.961891174316406, + "sampling/sampling_logp_difference/mean": 0.019646761938929558, + "step": 134 + }, + { + "clip_ratio/high_max": 1.7107527582993498e-05, + "clip_ratio/high_mean": 4.2768818957483745e-06, + "clip_ratio/low_mean": 3.014796902789385e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.442485103732906e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15848.0, + "completions/max_terminated_length": 15848.0, + "completions/mean_length": 5505.9375, + "completions/mean_terminated_length": 5505.9375, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "entropy": 
0.8041045889258385, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024891747161746025, + "learning_rate": 1e-05, + "loss": 0.1406, + "num_tokens": 102291456.0, + "reward": 0.5, + "reward_std": 0.35482609272003174, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999248385429382, + "sampling/importance_sampling_ratio/min": 0.0014627616619691253, + "sampling/sampling_logp_difference/max": 6.527429103851318, + "sampling/sampling_logp_difference/mean": 0.01716250739991665, + "step": 135 + }, + { + "clip_ratio/high_max": 1.548903105685895e-05, + "clip_ratio/high_mean": 3.872257764214737e-06, + "clip_ratio/low_mean": 5.380711581892683e-05, + "clip_ratio/low_min": 4.5777483137499075e-06, + "clip_ratio/region_mean": 5.767937363998499e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16005.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 5003.0625, + "completions/mean_terminated_length": 5003.0625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 0.9115714654326439, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00220683915540576, + "learning_rate": 1e-05, + "loss": 0.1361, + "num_tokens": 102949824.0, + "reward": 0.4140625, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 8.323705696966499e-05, + "sampling/sampling_logp_difference/max": 9.393817901611328, + "sampling/sampling_logp_difference/mean": 0.018076512962579727, + "step": 136 + }, + { + "clip_ratio/high_max": 2.181136096623959e-05, + "clip_ratio/high_mean": 
5.4528402415598975e-06, + "clip_ratio/low_mean": 3.4416837252138066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986967681157694e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15658.0, + "completions/max_terminated_length": 15658.0, + "completions/mean_length": 4742.1328125, + "completions/mean_terminated_length": 4742.1328125, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.9430246204137802, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003964806906878948, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 103580913.0, + "reward": 0.4609375, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 7.031940185697749e-05, + "sampling/sampling_logp_difference/max": 9.56246280670166, + "sampling/sampling_logp_difference/mean": 0.019651200622320175, + "step": 137 + }, + { + "clip_ratio/high_max": 4.07684046876966e-06, + "clip_ratio/high_mean": 1.019210117192415e-06, + "clip_ratio/low_mean": 3.8682398553646635e-05, + "clip_ratio/low_min": 8.189203072106466e-06, + "clip_ratio/region_mean": 3.970160832977854e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 6574.171875, + "completions/mean_terminated_length": 6091.72119140625, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.8429529070854187, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002067410387098789, + "learning_rate": 1e-05, + "loss": 0.0377, + "num_tokens": 104447463.0, + "reward": 0.3125, + "reward_std": 0.24511480331420898, + "rewards/accuracy_reward/mean": 0.3125, 
+ "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997583627700806, + "sampling/importance_sampling_ratio/min": 0.00021258489869069308, + "sampling/sampling_logp_difference/max": 8.456169128417969, + "sampling/sampling_logp_difference/mean": 0.018853647634387016, + "step": 138 + }, + { + "clip_ratio/high_max": 1.9725823221961036e-05, + "clip_ratio/high_mean": 4.931455805490259e-06, + "clip_ratio/low_mean": 5.9263072444082354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.419452870431996e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15518.0, + "completions/max_terminated_length": 15518.0, + "completions/mean_length": 4581.5625, + "completions/mean_terminated_length": 4581.5625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.7094272822141647, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004292502999305725, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 105052287.0, + "reward": 0.625, + "reward_std": 0.3908300995826721, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.0019342642044648528, + "sampling/sampling_logp_difference/max": 6.24802827835083, + "sampling/sampling_logp_difference/mean": 0.016310662031173706, + "step": 139 + }, + { + "clip_ratio/high_max": 1.0132298029930098e-05, + "clip_ratio/high_mean": 2.5330745074825245e-06, + "clip_ratio/low_mean": 4.6397121650443296e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.893019581686531e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16097.0, + "completions/mean_length": 7066.4453125, + 
"completions/mean_terminated_length": 6918.5478515625, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8481669947504997, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015785128343850374, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 105977048.0, + "reward": 0.3515625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.00104097044095397, + "sampling/sampling_logp_difference/max": 6.8676018714904785, + "sampling/sampling_logp_difference/mean": 0.018304405733942986, + "step": 140 + }, + { + "clip_ratio/high_max": 1.6989023606583942e-05, + "clip_ratio/high_mean": 4.2472559016459854e-06, + "clip_ratio/low_mean": 2.3075059743860038e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7322315418132348e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16104.0, + "completions/max_terminated_length": 16104.0, + "completions/mean_length": 6230.5234375, + "completions/mean_terminated_length": 6230.5234375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.9658062160015106, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002542720176279545, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 106793187.0, + "reward": 0.3203125, + "reward_std": 0.3050953149795532, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000169277191162, + "sampling/importance_sampling_ratio/min": 0.0002781494113150984, + "sampling/sampling_logp_difference/max": 8.187352180480957, + 
"sampling/sampling_logp_difference/mean": 0.019391046836972237, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7597974508353218e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7597974508353218e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14216.0, + "completions/mean_length": 5690.5546875, + "completions/mean_terminated_length": 5606.3544921875, + "completions/min_length": 1124.0, + "completions/min_terminated_length": 1124.0, + "entropy": 1.0098655670881271, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001451602904126048, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 107539874.0, + "reward": 0.4296875, + "reward_std": 0.23304283618927002, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999307990074158, + "sampling/importance_sampling_ratio/min": 5.640022671116185e-09, + "sampling/sampling_logp_difference/max": 18.993377685546875, + "sampling/sampling_logp_difference/mean": 0.018607191741466522, + "step": 142 + }, + { + "clip_ratio/high_max": 1.2800467629858758e-05, + "clip_ratio/high_mean": 4.19954119479371e-06, + "clip_ratio/low_mean": 2.350350996493944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.770305115973315e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15791.0, + "completions/max_terminated_length": 15791.0, + "completions/mean_length": 5471.1328125, + "completions/mean_terminated_length": 5471.1328125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0413162112236023, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023549250327050686, + "learning_rate": 1e-05, + "loss": 0.0411, + 
"num_tokens": 108260091.0, + "reward": 0.3203125, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999832510948181, + "sampling/importance_sampling_ratio/min": 0.0011709182290360332, + "sampling/sampling_logp_difference/max": 6.749967098236084, + "sampling/sampling_logp_difference/mean": 0.020427243784070015, + "step": 143 + }, + { + "clip_ratio/high_max": 2.1983064925734652e-05, + "clip_ratio/high_mean": 5.495766231433663e-06, + "clip_ratio/low_mean": 4.361141452591255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9107180757346214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 6211.7421875, + "completions/mean_terminated_length": 6050.2783203125, + "completions/min_length": 622.0, + "completions/min_terminated_length": 622.0, + "entropy": 0.9706784337759018, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017527056625112891, + "learning_rate": 1e-05, + "loss": 0.0686, + "num_tokens": 109073890.0, + "reward": 0.421875, + "reward_std": 0.29826050996780396, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999092221260071, + "sampling/importance_sampling_ratio/min": 0.002898645820096135, + "sampling/sampling_logp_difference/max": 5.843511581420898, + "sampling/sampling_logp_difference/mean": 0.018898162990808487, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.208964992358233e-05, + "clip_ratio/low_min": 3.9168990042526275e-06, + "clip_ratio/region_mean": 4.208964992358233e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 14880.0, + "completions/mean_length": 6007.8984375, + "completions/mean_terminated_length": 5926.19677734375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.1967609524726868, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0007858420140109956, + "learning_rate": 1e-05, + "loss": 0.011, + "num_tokens": 109861813.0, + "reward": 0.296875, + "reward_std": 0.23486506938934326, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 3.294382011631569e-08, + "sampling/sampling_logp_difference/max": 17.22846221923828, + "sampling/sampling_logp_difference/mean": 0.021845955401659012, + "step": 145 + }, + { + "clip_ratio/high_max": 4.5118208618077915e-06, + "clip_ratio/high_mean": 1.1279552154519479e-06, + "clip_ratio/low_mean": 3.749712686840212e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8625082197540905e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15838.0, + "completions/mean_length": 6800.9921875, + "completions/mean_terminated_length": 6725.53564453125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.0437887012958527, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029428249690681696, + "learning_rate": 1e-05, + "loss": 0.0405, + "num_tokens": 110756572.0, + "reward": 0.265625, + "reward_std": 0.3248382806777954, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999890327453613, + 
"sampling/importance_sampling_ratio/min": 0.0006329434108920395, + "sampling/sampling_logp_difference/max": 7.365129470825195, + "sampling/sampling_logp_difference/mean": 0.02010120078921318, + "step": 146 + }, + { + "clip_ratio/high_max": 1.427700522071973e-05, + "clip_ratio/high_mean": 3.5692513051799324e-06, + "clip_ratio/low_mean": 4.964020990883e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.320946092979284e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6309.4453125, + "completions/mean_terminated_length": 6230.1181640625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.9768906533718109, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002088683657348156, + "learning_rate": 1e-05, + "loss": 0.0316, + "num_tokens": 111585493.0, + "reward": 0.375, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000007152557373, + "sampling/importance_sampling_ratio/min": 0.009723234921693802, + "sampling/sampling_logp_difference/max": 4.633236885070801, + "sampling/sampling_logp_difference/mean": 0.020927833393216133, + "step": 147 + }, + { + "clip_ratio/high_max": 5.4841398196003865e-06, + "clip_ratio/high_mean": 1.3710349549000966e-06, + "clip_ratio/low_mean": 5.122006064084417e-05, + "clip_ratio/low_min": 3.785125954891555e-06, + "clip_ratio/region_mean": 5.25910957094311e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15209.0, + "completions/mean_length": 6221.859375, + "completions/mean_terminated_length": 6060.5556640625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 
0.9212924689054489, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002406956860795617, + "learning_rate": 1e-05, + "loss": 0.1051, + "num_tokens": 112400363.0, + "reward": 0.40625, + "reward_std": 0.31929677724838257, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 5.8308287407271564e-05, + "sampling/sampling_logp_difference/max": 9.74976634979248, + "sampling/sampling_logp_difference/mean": 0.018652018159627914, + "step": 148 + }, + { + "clip_ratio/high_max": 1.4568151755156578e-05, + "clip_ratio/high_mean": 3.6420379387891444e-06, + "clip_ratio/low_mean": 3.999794398623635e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3639981413434725e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14997.0, + "completions/mean_length": 6942.8203125, + "completions/mean_terminated_length": 6716.232421875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.949538916349411, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022962254006415606, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 113308748.0, + "reward": 0.375, + "reward_std": 0.3329663872718811, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999334812164307, + "sampling/importance_sampling_ratio/min": 0.00048810525913722813, + "sampling/sampling_logp_difference/max": 7.624979496002197, + "sampling/sampling_logp_difference/mean": 0.01939917355775833, + "step": 149 + }, + { + "clip_ratio/high_max": 8.786732450971613e-06, + "clip_ratio/high_mean": 
2.196683112742903e-06, + "clip_ratio/low_mean": 5.562954720517155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7826231113722315e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15182.0, + "completions/mean_length": 6783.1796875, + "completions/mean_terminated_length": 6552.76025390625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.9774708449840546, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020560629200190306, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 114196235.0, + "reward": 0.34375, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998990297317505, + "sampling/importance_sampling_ratio/min": 2.4757892447269114e-07, + "sampling/sampling_logp_difference/max": 15.211536407470703, + "sampling/sampling_logp_difference/mean": 0.019691556692123413, + "step": 150 + }, + { + "clip_ratio/high_max": 1.799483243303257e-05, + "clip_ratio/high_mean": 4.498708108258143e-06, + "clip_ratio/low_mean": 2.6389980291696702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0888688343111426e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15549.0, + "completions/mean_length": 5568.15625, + "completions/mean_terminated_length": 5396.4765625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "entropy": 0.9303529411554337, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022214846685528755, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 114928047.0, + "reward": 0.234375, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.234375, + 
"rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999408721923828, + "sampling/importance_sampling_ratio/min": 2.1446083337650634e-05, + "sampling/sampling_logp_difference/max": 10.749968528747559, + "sampling/sampling_logp_difference/mean": 0.01938418298959732, + "step": 151 + }, + { + "clip_ratio/high_max": 1.1957493370573502e-05, + "clip_ratio/high_mean": 2.9893733426433755e-06, + "clip_ratio/low_mean": 5.885063319510664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.184000585562899e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15340.0, + "completions/max_terminated_length": 15340.0, + "completions/mean_length": 6086.578125, + "completions/mean_terminated_length": 6086.578125, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 0.9131873697042465, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002448044717311859, + "learning_rate": 1e-05, + "loss": 0.0599, + "num_tokens": 115725657.0, + "reward": 0.40625, + "reward_std": 0.35878273844718933, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999779462814331, + "sampling/importance_sampling_ratio/min": 0.02929726243019104, + "sampling/sampling_logp_difference/max": 3.530261278152466, + "sampling/sampling_logp_difference/mean": 0.019298439845442772, + "step": 152 + }, + { + "clip_ratio/high_max": 1.3385357760853367e-05, + "clip_ratio/high_mean": 3.3463394402133417e-06, + "clip_ratio/low_mean": 5.717015119444113e-05, + "clip_ratio/low_min": 3.4328400033700746e-06, + "clip_ratio/region_mean": 6.0516490520967636e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + 
"completions/mean_length": 6442.5390625, + "completions/mean_terminated_length": 6203.9443359375, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.8959419652819633, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002013204852119088, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 116571478.0, + "reward": 0.2734375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000044584274292, + "sampling/importance_sampling_ratio/min": 1.0374163821325055e-06, + "sampling/sampling_logp_difference/max": 13.778777122497559, + "sampling/sampling_logp_difference/mean": 0.01925014518201351, + "step": 153 + }, + { + "clip_ratio/high_max": 9.34224021875707e-06, + "clip_ratio/high_mean": 3.136903728773177e-06, + "clip_ratio/low_mean": 2.9738095065567904e-05, + "clip_ratio/low_min": 3.7240065466903616e-06, + "clip_ratio/region_mean": 3.2874999135401595e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15946.0, + "completions/mean_length": 6633.5703125, + "completions/mean_terminated_length": 6319.0400390625, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.0223619118332863, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024523327592760324, + "learning_rate": 1e-05, + "loss": 0.056, + "num_tokens": 117440743.0, + "reward": 0.3203125, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 3.0026931199245155e-05, + 
"sampling/sampling_logp_difference/max": 10.413415908813477, + "sampling/sampling_logp_difference/mean": 0.02061290666460991, + "step": 154 + }, + { + "clip_ratio/high_max": 1.4537483366439119e-05, + "clip_ratio/high_mean": 3.6343708416097797e-06, + "clip_ratio/low_mean": 3.954866042477079e-05, + "clip_ratio/low_min": 9.874949228105834e-06, + "clip_ratio/region_mean": 4.318303126638057e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15919.0, + "completions/mean_length": 7183.0, + "completions/mean_terminated_length": 6886.193359375, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.9815369099378586, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018688985146582127, + "learning_rate": 1e-05, + "loss": 0.0395, + "num_tokens": 118380687.0, + "reward": 0.2890625, + "reward_std": 0.2498900145292282, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999039173126221, + "sampling/importance_sampling_ratio/min": 1.3847662557964213e-05, + "sampling/sampling_logp_difference/max": 11.187394142150879, + "sampling/sampling_logp_difference/mean": 0.019792160019278526, + "step": 155 + }, + { + "clip_ratio/high_max": 7.165636361605721e-06, + "clip_ratio/high_mean": 1.7914090904014301e-06, + "clip_ratio/low_mean": 4.9011068711024564e-05, + "clip_ratio/low_min": 1.0991705721608014e-05, + "clip_ratio/region_mean": 5.0802477687739156e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 6324.640625, + "completions/mean_terminated_length": 5829.91748046875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.852975606918335, + "epoch": 
0.14351425942962281, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002005894435569644, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 119207089.0, + "reward": 0.3984375, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000035762786865, + "sampling/importance_sampling_ratio/min": 5.788659223071591e-07, + "sampling/sampling_logp_difference/max": 14.362195014953613, + "sampling/sampling_logp_difference/mean": 0.01853565312922001, + "step": 156 + }, + { + "clip_ratio/high_max": 7.795394822096569e-06, + "clip_ratio/high_mean": 1.948848705524142e-06, + "clip_ratio/low_mean": 3.834237736555224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0291225786859286e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 5723.421875, + "completions/mean_terminated_length": 5290.06494140625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.8744911625981331, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002577397273853421, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 119961895.0, + "reward": 0.390625, + "reward_std": 0.34321609139442444, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999703764915466, + "sampling/importance_sampling_ratio/min": 0.07882421463727951, + "sampling/sampling_logp_difference/max": 2.5405349731445312, + "sampling/sampling_logp_difference/mean": 0.018341556191444397, + "step": 157 + }, + { + "clip_ratio/high_max": 9.214097190124448e-06, + "clip_ratio/high_mean": 2.303524297531112e-06, + 
"clip_ratio/low_mean": 2.636873176697918e-05, + "clip_ratio/low_min": 2.9339967113628518e-06, + "clip_ratio/region_mean": 2.8672255837136618e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16055.0, + "completions/mean_length": 7886.015625, + "completions/mean_terminated_length": 7682.064453125, + "completions/min_length": 989.0, + "completions/min_terminated_length": 989.0, + "entropy": 0.9391767829656601, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002552987542003393, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 120990289.0, + "reward": 0.328125, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 0.000899312668479979, + "sampling/sampling_logp_difference/max": 7.013879776000977, + "sampling/sampling_logp_difference/mean": 0.02049873024225235, + "step": 158 + }, + { + "clip_ratio/high_max": 3.406416203688423e-05, + "clip_ratio/high_mean": 9.72330332160709e-06, + "clip_ratio/low_mean": 3.168332909808669e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140663151019908e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 6173.1640625, + "completions/mean_terminated_length": 6011.087890625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.9148785546422005, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002678362652659416, + "learning_rate": 1e-05, + "loss": 0.039, + "num_tokens": 121797958.0, + "reward": 0.4140625, + "reward_std": 0.3608373999595642, + "rewards/accuracy_reward/mean": 0.4140625, + 
"rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999265074729919, + "sampling/importance_sampling_ratio/min": 0.002013920107856393, + "sampling/sampling_logp_difference/max": 6.207672119140625, + "sampling/sampling_logp_difference/mean": 0.018977735191583633, + "step": 159 + }, + { + "clip_ratio/high_max": 1.8476588593330234e-05, + "clip_ratio/high_mean": 4.6191471483325586e-06, + "clip_ratio/low_mean": 4.459614581264759e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9215293188353826e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 6594.21875, + "completions/mean_terminated_length": 6196.259765625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.9486038386821747, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033711253199726343, + "learning_rate": 1e-05, + "loss": 0.026, + "num_tokens": 122661170.0, + "reward": 0.3828125, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998981356620789, + "sampling/importance_sampling_ratio/min": 0.0002968419576063752, + "sampling/sampling_logp_difference/max": 8.122310638427734, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 160 + }, + { + "clip_ratio/high_max": 7.97335997049231e-06, + "clip_ratio/high_mean": 2.7343705824023345e-06, + "clip_ratio/low_mean": 5.420079878604156e-05, + "clip_ratio/low_min": 4.594068286678521e-06, + "clip_ratio/region_mean": 5.693517005056492e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15928.0, + 
"completions/mean_length": 6533.9453125, + "completions/mean_terminated_length": 6377.595703125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9986584335565567, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017857529455795884, + "learning_rate": 1e-05, + "loss": 0.0804, + "num_tokens": 123518107.0, + "reward": 0.34375, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998549818992615, + "sampling/importance_sampling_ratio/min": 9.012701411847956e-06, + "sampling/sampling_logp_difference/max": 11.616875648498535, + "sampling/sampling_logp_difference/mean": 0.02010391652584076, + "step": 161 + }, + { + "clip_ratio/high_max": 4.470512521947967e-06, + "clip_ratio/high_mean": 1.1176281304869917e-06, + "clip_ratio/low_mean": 3.5141094485879876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.625872295742738e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13212.0, + "completions/mean_length": 5742.21875, + "completions/mean_terminated_length": 5658.42529296875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0379670709371567, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018227624241262674, + "learning_rate": 1e-05, + "loss": -0.0237, + "num_tokens": 124279031.0, + "reward": 0.21875, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998506903648376, + "sampling/importance_sampling_ratio/min": 0.0020977305248379707, + "sampling/sampling_logp_difference/max": 
6.16689920425415, + "sampling/sampling_logp_difference/mean": 0.019987668842077255, + "step": 162 + }, + { + "clip_ratio/high_max": 1.0003542683989508e-05, + "clip_ratio/high_mean": 3.21091931709816e-06, + "clip_ratio/low_mean": 5.731009014198207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.0521009800140746e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7584.703125, + "completions/mean_terminated_length": 7515.41748046875, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "entropy": 0.953459307551384, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002219022251665592, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 125270761.0, + "reward": 0.359375, + "reward_std": 0.37033066153526306, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999880790710449, + "sampling/importance_sampling_ratio/min": 0.0024849213659763336, + "sampling/sampling_logp_difference/max": 5.997514247894287, + "sampling/sampling_logp_difference/mean": 0.020291510969400406, + "step": 163 + }, + { + "clip_ratio/high_max": 7.734669452474918e-06, + "clip_ratio/high_mean": 1.9336673631187296e-06, + "clip_ratio/low_mean": 3.1135301298945706e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3068968605221016e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 4714.671875, + "completions/mean_terminated_length": 4622.78759765625, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 1.018719919025898, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.0014189074281603098, + "learning_rate": 1e-05, + "loss": 0.0501, + "num_tokens": 125895279.0, + "reward": 0.3984375, + "reward_std": 0.28383445739746094, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479651451111, + "sampling/importance_sampling_ratio/min": 4.017410901724361e-07, + "sampling/sampling_logp_difference/max": 14.727458000183105, + "sampling/sampling_logp_difference/mean": 0.018739396706223488, + "step": 164 + }, + { + "clip_ratio/high_max": 1.0069575182569679e-05, + "clip_ratio/high_mean": 2.5173937956424197e-06, + "clip_ratio/low_mean": 3.824179225375701e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0759185367278405e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15913.0, + "completions/mean_length": 6316.140625, + "completions/mean_terminated_length": 6074.51220703125, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.9325072392821312, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001702460227534175, + "learning_rate": 1e-05, + "loss": 0.1007, + "num_tokens": 126722881.0, + "reward": 0.4609375, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999539852142334, + "sampling/importance_sampling_ratio/min": 0.0012551364488899708, + "sampling/sampling_logp_difference/max": 6.680510997772217, + "sampling/sampling_logp_difference/mean": 0.01929408684372902, + "step": 165 + }, + { + "clip_ratio/high_max": 6.873041002108948e-06, + "clip_ratio/high_mean": 1.718260250527237e-06, + "clip_ratio/low_mean": 3.119859468370123e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.291685527528898e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15832.0, + "completions/mean_length": 4687.140625, + "completions/mean_terminated_length": 4595.03955078125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 1.0886607319116592, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032931750174611807, + "learning_rate": 1e-05, + "loss": 0.0078, + "num_tokens": 127341715.0, + "reward": 0.28125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821186065674, + "sampling/importance_sampling_ratio/min": 0.0019364450126886368, + "sampling/sampling_logp_difference/max": 6.246901512145996, + "sampling/sampling_logp_difference/mean": 0.020621225237846375, + "step": 166 + }, + { + "clip_ratio/high_max": 1.773085250533768e-05, + "clip_ratio/high_mean": 4.43271312633442e-06, + "clip_ratio/low_mean": 4.30743207289197e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7507033741567284e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14125.0, + "completions/mean_length": 5705.515625, + "completions/mean_terminated_length": 5449.232421875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0523068830370903, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0031696646474301815, + "learning_rate": 1e-05, + "loss": -0.0414, + "num_tokens": 128093597.0, + "reward": 0.1953125, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999619126319885, + "sampling/importance_sampling_ratio/min": 3.197810656274669e-05, + "sampling/sampling_logp_difference/max": 10.350459098815918, + "sampling/sampling_logp_difference/mean": 0.021961934864521027, + "step": 167 + }, + { + "clip_ratio/high_max": 1.885905066956184e-05, + "clip_ratio/high_mean": 4.71476266739046e-06, + "clip_ratio/low_mean": 5.0530389898995054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.524515336219338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15958.0, + "completions/mean_length": 6214.4921875, + "completions/mean_terminated_length": 6053.07177734375, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.9371421113610268, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023704832419753075, + "learning_rate": 1e-05, + "loss": 0.075, + "num_tokens": 128906948.0, + "reward": 0.40625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.0003354824730195105, + "sampling/sampling_logp_difference/max": 7.999940872192383, + "sampling/sampling_logp_difference/mean": 0.01882763020694256, + "step": 168 + }, + { + "clip_ratio/high_max": 3.042072216885572e-05, + "clip_ratio/high_mean": 7.60518054221393e-06, + "clip_ratio/low_mean": 4.5897569179942366e-05, + "clip_ratio/low_min": 8.727477506909054e-06, + "clip_ratio/region_mean": 5.3502750233747065e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 7127.0703125, + "completions/mean_terminated_length": 7054.18115234375, + 
"completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.9854387491941452, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003370177699252963, + "learning_rate": 1e-05, + "loss": 0.1197, + "num_tokens": 129839813.0, + "reward": 0.359375, + "reward_std": 0.3329663574695587, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999907910823822, + "sampling/importance_sampling_ratio/min": 1.077816432371037e-05, + "sampling/sampling_logp_difference/max": 11.43798828125, + "sampling/sampling_logp_difference/mean": 0.019736800342798233, + "step": 169 + }, + { + "clip_ratio/high_max": 2.1401074718596647e-05, + "clip_ratio/high_mean": 6.243764005375851e-06, + "clip_ratio/low_mean": 3.2797592325550795e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.904135610355297e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 6566.2890625, + "completions/mean_terminated_length": 6330.6640625, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.7978609576821327, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026055986527353525, + "learning_rate": 1e-05, + "loss": 0.0661, + "num_tokens": 130698370.0, + "reward": 0.5, + "reward_std": 0.36295419931411743, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999133944511414, + "sampling/importance_sampling_ratio/min": 0.00031152591691352427, + "sampling/sampling_logp_difference/max": 8.074028015136719, + "sampling/sampling_logp_difference/mean": 0.01787097379565239, + "step": 170 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0564424403346493e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0564424403346493e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15576.0, + "completions/max_terminated_length": 15576.0, + "completions/mean_length": 7186.2890625, + "completions/mean_terminated_length": 7186.2890625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 1.0232757329940796, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0023866184055805206, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 131637439.0, + "reward": 0.2734375, + "reward_std": 0.2059282809495926, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999207258224487, + "sampling/importance_sampling_ratio/min": 0.0007378471200354397, + "sampling/sampling_logp_difference/max": 7.211773872375488, + "sampling/sampling_logp_difference/mean": 0.02137116715312004, + "step": 171 + }, + { + "clip_ratio/high_max": 4.037900725961663e-05, + "clip_ratio/high_mean": 1.0094751814904157e-05, + "clip_ratio/low_mean": 5.8380828136250784e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.847557995115494e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13638.0, + "completions/mean_length": 5591.5703125, + "completions/mean_terminated_length": 5420.26220703125, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9335208311676979, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003491115989163518, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 132371816.0, + "reward": 0.5, + "reward_std": 0.3406373858451843, + 
"rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999891459941864, + "sampling/importance_sampling_ratio/min": 0.00012356207298580557, + "sampling/sampling_logp_difference/max": 8.998766899108887, + "sampling/sampling_logp_difference/mean": 0.018760837614536285, + "step": 172 + }, + { + "clip_ratio/high_max": 2.8378776733006816e-06, + "clip_ratio/high_mean": 7.094694183251704e-07, + "clip_ratio/low_mean": 4.4085751369493664e-05, + "clip_ratio/low_min": 6.7955093072669115e-06, + "clip_ratio/region_mean": 4.4795220674132e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16302.0, + "completions/mean_length": 7152.3828125, + "completions/mean_terminated_length": 6930.82421875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.1329835206270218, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002830669516697526, + "learning_rate": 1e-05, + "loss": 0.0526, + "num_tokens": 133307297.0, + "reward": 0.28125, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999501705169678, + "sampling/importance_sampling_ratio/min": 0.00028047082014381886, + "sampling/sampling_logp_difference/max": 8.179040908813477, + "sampling/sampling_logp_difference/mean": 0.021548541262745857, + "step": 173 + }, + { + "clip_ratio/high_max": 1.0150829439226072e-05, + "clip_ratio/high_mean": 2.537707359806518e-06, + "clip_ratio/low_mean": 3.4009618616437365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.654732597624388e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15068.0, + "completions/mean_length": 7263.453125, + "completions/mean_terminated_length": 7118.68310546875, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.092760555446148, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0027821618132293224, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 134260107.0, + "reward": 0.3203125, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999946117401123, + "sampling/importance_sampling_ratio/min": 7.832317351130769e-05, + "sampling/sampling_logp_difference/max": 9.454667091369629, + "sampling/sampling_logp_difference/mean": 0.022098438814282417, + "step": 174 + }, + { + "clip_ratio/high_max": 1.0561876024439698e-05, + "clip_ratio/high_mean": 2.6404690061099245e-06, + "clip_ratio/low_mean": 1.6864279416495265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9504748649978865e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15388.0, + "completions/mean_length": 7088.8125, + "completions/mean_terminated_length": 6710.958984375, + "completions/min_length": 1314.0, + "completions/min_terminated_length": 1314.0, + "entropy": 1.0669445469975471, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0007076738984324038, + "learning_rate": 1e-05, + "loss": -0.0197, + "num_tokens": 135186139.0, + "reward": 0.328125, + "reward_std": 0.20593319833278656, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998199343681335, + "sampling/importance_sampling_ratio/min": 
3.084653872065246e-05, + "sampling/sampling_logp_difference/max": 10.386486053466797, + "sampling/sampling_logp_difference/mean": 0.020075790584087372, + "step": 175 + }, + { + "clip_ratio/high_max": 7.095016371749807e-06, + "clip_ratio/high_mean": 1.7737540929374518e-06, + "clip_ratio/low_mean": 2.7592465016823553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.936621888238733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15626.0, + "completions/max_terminated_length": 15626.0, + "completions/mean_length": 5352.734375, + "completions/mean_terminated_length": 5352.734375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 1.0387161895632744, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0022445612121373415, + "learning_rate": 1e-05, + "loss": 0.0261, + "num_tokens": 135888929.0, + "reward": 0.4765625, + "reward_std": 0.399257630109787, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054670333862, + "sampling/importance_sampling_ratio/min": 0.00032565294532105327, + "sampling/sampling_logp_difference/max": 8.029678344726562, + "sampling/sampling_logp_difference/mean": 0.02010166086256504, + "step": 176 + }, + { + "clip_ratio/high_max": 1.5100852124305675e-05, + "clip_ratio/high_mean": 4.426987970873597e-06, + "clip_ratio/low_mean": 2.7625993425317574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2052981168817496e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16266.0, + "completions/mean_length": 7758.90625, + "completions/mean_terminated_length": 7408.29248046875, + "completions/min_length": 742.0, + "completions/min_terminated_length": 742.0, + "entropy": 1.0648984238505363, + "epoch": 0.16283348666053357, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0022021254990249872, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 136901941.0, + "reward": 0.3671875, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858140945435, + "sampling/importance_sampling_ratio/min": 2.2461865967216e-07, + "sampling/sampling_logp_difference/max": 15.30886173248291, + "sampling/sampling_logp_difference/mean": 0.021426808089017868, + "step": 177 + }, + { + "clip_ratio/high_max": 2.5346608254039893e-05, + "clip_ratio/high_mean": 7.4063813144675805e-06, + "clip_ratio/low_mean": 2.2069365058996482e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9475746259777225e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 7036.953125, + "completions/mean_terminated_length": 6496.21484375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9684997871518135, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013461806811392307, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 137824623.0, + "reward": 0.34375, + "reward_std": 0.2546031177043915, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999944806098938, + "sampling/importance_sampling_ratio/min": 5.834372132085264e-05, + "sampling/sampling_logp_difference/max": 9.74915885925293, + "sampling/sampling_logp_difference/mean": 0.020304443314671516, + "step": 178 + }, + { + "clip_ratio/high_max": 1.3147734080121154e-05, + "clip_ratio/high_mean": 3.2869335200302885e-06, + "clip_ratio/low_mean": 4.841489999307669e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.170183294467279e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15500.0, + "completions/mean_length": 6114.1875, + "completions/mean_terminated_length": 5951.1748046875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.943072073161602, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132438588887453, + "learning_rate": 1e-05, + "loss": 0.0943, + "num_tokens": 138625247.0, + "reward": 0.40625, + "reward_std": 0.321650892496109, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999298453330994, + "sampling/importance_sampling_ratio/min": 0.0017275095451623201, + "sampling/sampling_logp_difference/max": 6.361074447631836, + "sampling/sampling_logp_difference/mean": 0.020084267482161522, + "step": 179 + }, + { + "clip_ratio/high_max": 1.7873157958092634e-05, + "clip_ratio/high_mean": 4.468289489523158e-06, + "clip_ratio/low_mean": 3.5252990301160025e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9721279790683184e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15050.0, + "completions/mean_length": 7618.875, + "completions/mean_terminated_length": 7034.53369140625, + "completions/min_length": 1030.0, + "completions/min_terminated_length": 1030.0, + "entropy": 0.9142575263977051, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026741649489849806, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 139619287.0, + "reward": 0.2890625, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 0.005949751473963261, + "sampling/sampling_logp_difference/max": 5.124405860900879, + "sampling/sampling_logp_difference/mean": 0.020061582326889038, + "step": 180 + }, + { + "clip_ratio/high_max": 1.0512151675357018e-05, + "clip_ratio/high_mean": 2.6280379188392544e-06, + "clip_ratio/low_mean": 4.5301517502593924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.792955542143318e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16106.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 5333.875, + "completions/mean_terminated_length": 5333.875, + "completions/min_length": 1109.0, + "completions/min_terminated_length": 1109.0, + "entropy": 0.8107482865452766, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027016003150492907, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 140318935.0, + "reward": 0.5703125, + "reward_std": 0.2556639611721039, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + "sampling/importance_sampling_ratio/min": 0.006856904830783606, + "sampling/sampling_logp_difference/max": 4.982499122619629, + "sampling/sampling_logp_difference/mean": 0.017069874331355095, + "step": 181 + }, + { + "clip_ratio/high_max": 1.85085939392593e-05, + "clip_ratio/high_mean": 5.24943533264377e-06, + "clip_ratio/low_mean": 5.6120721524166584e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.137015702734061e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16050.0, + "completions/mean_length": 7443.3046875, + "completions/mean_terminated_length": 7154.89501953125, + 
"completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 0.9224414080381393, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002655779244378209, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 141293534.0, + "reward": 0.234375, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999659061431885, + "sampling/importance_sampling_ratio/min": 0.00042018835665658116, + "sampling/sampling_logp_difference/max": 7.774807453155518, + "sampling/sampling_logp_difference/mean": 0.02006504125893116, + "step": 182 + }, + { + "clip_ratio/high_max": 1.494229445597739e-05, + "clip_ratio/high_mean": 3.7355736139943474e-06, + "clip_ratio/low_mean": 2.2748562741981004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6484136355975352e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15923.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 5646.6875, + "completions/mean_terminated_length": 5646.6875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8945339694619179, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016281780553981662, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 142037438.0, + "reward": 0.46875, + "reward_std": 0.17912296950817108, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030517578125, + "sampling/importance_sampling_ratio/min": 0.0005717006279155612, + "sampling/sampling_logp_difference/max": 7.46689510345459, + "sampling/sampling_logp_difference/mean": 0.019336247816681862, + "step": 183 + }, + { + 
"clip_ratio/high_max": 3.335990868436056e-05, + "clip_ratio/high_mean": 8.33997717109014e-06, + "clip_ratio/low_mean": 3.5050728683927446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339070608239126e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14142.0, + "completions/mean_length": 6384.640625, + "completions/mean_terminated_length": 5892.86865234375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.840093269944191, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002166559686884284, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 142873848.0, + "reward": 0.4765625, + "reward_std": 0.35506346821784973, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 4.785555574926548e-06, + "sampling/sampling_logp_difference/max": 12.249908447265625, + "sampling/sampling_logp_difference/mean": 0.018109092488884926, + "step": 184 + }, + { + "clip_ratio/high_max": 1.541105484648142e-05, + "clip_ratio/high_mean": 3.852763711620355e-06, + "clip_ratio/low_mean": 4.0552770769863855e-05, + "clip_ratio/low_min": 7.133888630050933e-06, + "clip_ratio/region_mean": 4.440553459517105e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14828.0, + "completions/mean_length": 5775.0, + "completions/mean_terminated_length": 5691.46435546875, + "completions/min_length": 1147.0, + "completions/min_terminated_length": 1147.0, + "entropy": 0.8915362879633904, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021932912059128284, + "learning_rate": 1e-05, + "loss": -0.0086, + "num_tokens": 143636152.0, + "reward": 
0.4375, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000008225440979, + "sampling/importance_sampling_ratio/min": 9.714113069492214e-09, + "sampling/sampling_logp_difference/max": 18.44968605041504, + "sampling/sampling_logp_difference/mean": 0.019278086721897125, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7509142171311396e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7509142171311396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6181.640625, + "completions/mean_terminated_length": 6019.69873046875, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "entropy": 1.0544511675834656, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022947140969336033, + "learning_rate": 1e-05, + "loss": 0.0242, + "num_tokens": 144447370.0, + "reward": 0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999147653579712, + "sampling/importance_sampling_ratio/min": 7.419757253046555e-08, + "sampling/sampling_logp_difference/max": 16.416534423828125, + "sampling/sampling_logp_difference/mean": 0.02050788700580597, + "step": 186 + }, + { + "clip_ratio/high_max": 1.5700999938417226e-05, + "clip_ratio/high_mean": 3.9252499846043065e-06, + "clip_ratio/low_mean": 2.4595847037289786e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8521096965050674e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15824.0, + "completions/mean_length": 6542.3046875, + "completions/mean_terminated_length": 6306.1044921875, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.933225467801094, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034910975955426693, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 145303505.0, + "reward": 0.390625, + "reward_std": 0.30433881282806396, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999945163726807, + "sampling/importance_sampling_ratio/min": 0.007213745731860399, + "sampling/sampling_logp_difference/max": 4.931766986846924, + "sampling/sampling_logp_difference/mean": 0.020022759214043617, + "step": 187 + }, + { + "clip_ratio/high_max": 6.0999414017715026e-06, + "clip_ratio/high_mean": 1.5249853504428756e-06, + "clip_ratio/low_mean": 2.61421698724007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7667155109156738e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 5889.4765625, + "completions/mean_terminated_length": 5637.6083984375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.9649673849344254, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024078311398625374, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 146082198.0, + "reward": 0.3359375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999341368675232, + "sampling/importance_sampling_ratio/min": 
0.0008680344326421618, + "sampling/sampling_logp_difference/max": 7.04927921295166, + "sampling/sampling_logp_difference/mean": 0.02060198038816452, + "step": 188 + }, + { + "clip_ratio/high_max": 7.789618393871933e-06, + "clip_ratio/high_mean": 1.9474045984679833e-06, + "clip_ratio/low_mean": 3.6395756637830345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.834316100892465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16233.0, + "completions/mean_length": 5349.2421875, + "completions/mean_terminated_length": 5084.408203125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8402756005525589, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021191861014813185, + "learning_rate": 1e-05, + "loss": 0.1275, + "num_tokens": 146786245.0, + "reward": 0.4765625, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999837875366211, + "sampling/importance_sampling_ratio/min": 3.763807762879878e-05, + "sampling/sampling_logp_difference/max": 10.187494277954102, + "sampling/sampling_logp_difference/mean": 0.017112664878368378, + "step": 189 + }, + { + "clip_ratio/high_max": 1.2461773394534248e-05, + "clip_ratio/high_mean": 3.115443348633562e-06, + "clip_ratio/low_mean": 5.095924211673264e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4074685294835945e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 7272.3203125, + "completions/mean_terminated_length": 7053.64013671875, + "completions/min_length": 1074.0, + "completions/min_terminated_length": 1074.0, + "entropy": 0.9627499282360077, + "epoch": 
0.17479300827966882, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022120666690170765, + "learning_rate": 1e-05, + "loss": 0.0079, + "num_tokens": 147737086.0, + "reward": 0.2890625, + "reward_std": 0.27304792404174805, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999538660049438, + "sampling/importance_sampling_ratio/min": 1.6960719221970066e-05, + "sampling/sampling_logp_difference/max": 10.984610557556152, + "sampling/sampling_logp_difference/mean": 0.0203307643532753, + "step": 190 + }, + { + "clip_ratio/high_max": 1.7891727566166082e-05, + "clip_ratio/high_mean": 4.472931891541521e-06, + "clip_ratio/low_mean": 5.616715043288423e-05, + "clip_ratio/low_min": 7.80031223257538e-06, + "clip_ratio/region_mean": 6.064008221073891e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 6387.1875, + "completions/mean_terminated_length": 5895.54052734375, + "completions/min_length": 1310.0, + "completions/min_terminated_length": 1310.0, + "entropy": 0.9110158830881119, + "epoch": 0.17571297148114076, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030851473566144705, + "learning_rate": 1e-05, + "loss": 0.1091, + "num_tokens": 148573782.0, + "reward": 0.40625, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 0.003961040172725916, + "sampling/sampling_logp_difference/max": 5.531248569488525, + "sampling/sampling_logp_difference/mean": 0.018049638718366623, + "step": 191 + }, + { + "clip_ratio/high_max": 1.6994396901282016e-05, + "clip_ratio/high_mean": 5.400205964178895e-06, + 
"clip_ratio/low_mean": 3.274822392995702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8148429439388565e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 7267.59375, + "completions/mean_terminated_length": 7195.81103515625, + "completions/min_length": 653.0, + "completions/min_terminated_length": 653.0, + "entropy": 0.9254888147115707, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020694085396826267, + "learning_rate": 1e-05, + "loss": 0.0462, + "num_tokens": 149521258.0, + "reward": 0.2734375, + "reward_std": 0.29719972610473633, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054670333862, + "sampling/importance_sampling_ratio/min": 7.411616934405174e-06, + "sampling/sampling_logp_difference/max": 11.812461853027344, + "sampling/sampling_logp_difference/mean": 0.01898832805454731, + "step": 192 + }, + { + "clip_ratio/high_max": 4.10414668294834e-06, + "clip_ratio/high_mean": 1.026036670737085e-06, + "clip_ratio/low_mean": 4.7441100377909606e-05, + "clip_ratio/low_min": 4.552241534838686e-06, + "clip_ratio/region_mean": 4.8467136821273016e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16076.0, + "completions/mean_length": 7100.1953125, + "completions/mean_terminated_length": 6952.83349609375, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "entropy": 0.8455610796809196, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003085972974076867, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 150447923.0, + "reward": 0.25, + "reward_std": 0.23645778000354767, + "rewards/accuracy_reward/mean": 0.25, + 
"rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999178647994995, + "sampling/importance_sampling_ratio/min": 0.0011708807433024049, + "sampling/sampling_logp_difference/max": 6.749999046325684, + "sampling/sampling_logp_difference/mean": 0.01974140852689743, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.6514521121280268e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6514521121280268e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15535.0, + "completions/mean_length": 6626.4296875, + "completions/mean_terminated_length": 6549.5986328125, + "completions/min_length": 1746.0, + "completions/min_terminated_length": 1746.0, + "entropy": 1.0323699787259102, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003505800850689411, + "learning_rate": 1e-05, + "loss": 0.0885, + "num_tokens": 151313834.0, + "reward": 0.390625, + "reward_std": 0.17176413536071777, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381303787231, + "sampling/importance_sampling_ratio/min": 2.8102756914449856e-05, + "sampling/sampling_logp_difference/max": 10.479642868041992, + "sampling/sampling_logp_difference/mean": 0.021082937717437744, + "step": 194 + }, + { + "clip_ratio/high_max": 2.006086378969485e-05, + "clip_ratio/high_mean": 5.890002398700744e-06, + "clip_ratio/low_mean": 3.503898199141986e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.092898473118112e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15595.0, + "completions/mean_length": 7093.109375, + 
"completions/mean_terminated_length": 6870.12841796875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 1.0206764563918114, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002495395252481103, + "learning_rate": 1e-05, + "loss": 0.0308, + "num_tokens": 152238192.0, + "reward": 0.2890625, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999728798866272, + "sampling/importance_sampling_ratio/min": 9.536534344078973e-05, + "sampling/sampling_logp_difference/max": 9.257795333862305, + "sampling/sampling_logp_difference/mean": 0.020610272884368896, + "step": 195 + }, + { + "clip_ratio/high_max": 3.2352409107261337e-06, + "clip_ratio/high_mean": 8.088102276815334e-07, + "clip_ratio/low_mean": 4.056704699451075e-05, + "clip_ratio/low_min": 1.1648833606159315e-05, + "clip_ratio/region_mean": 4.1375856994818605e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14191.0, + "completions/mean_length": 6795.71875, + "completions/mean_terminated_length": 6486.4189453125, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "entropy": 0.8927837759256363, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014066790463402867, + "learning_rate": 1e-05, + "loss": -0.0031, + "num_tokens": 153131828.0, + "reward": 0.3359375, + "reward_std": 0.2409384548664093, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 5.093755135021638e-06, + "sampling/sampling_logp_difference/max": 12.187495231628418, + 
"sampling/sampling_logp_difference/mean": 0.01874586008489132, + "step": 196 + }, + { + "clip_ratio/high_max": 1.5244630048982799e-05, + "clip_ratio/high_mean": 3.8111575122456998e-06, + "clip_ratio/low_mean": 3.655197178886738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.03631290737394e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15831.0, + "completions/mean_length": 7075.1015625, + "completions/mean_terminated_length": 6617.28662109375, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 0.8989318311214447, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017937121447175741, + "learning_rate": 1e-05, + "loss": 0.0359, + "num_tokens": 154057097.0, + "reward": 0.3984375, + "reward_std": 0.23068872094154358, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998950958251953, + "sampling/importance_sampling_ratio/min": 0.00021659507183358073, + "sampling/sampling_logp_difference/max": 8.437480926513672, + "sampling/sampling_logp_difference/mean": 0.01890135183930397, + "step": 197 + }, + { + "clip_ratio/high_max": 1.4074375030759256e-05, + "clip_ratio/high_mean": 4.977033995601232e-06, + "clip_ratio/low_mean": 3.2670792506905855e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.764782627513341e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14100.0, + "completions/mean_length": 7120.0, + "completions/mean_terminated_length": 6743.41455078125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.8758384585380554, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003410576842725277, + 
"learning_rate": 1e-05, + "loss": 0.0536, + "num_tokens": 154988585.0, + "reward": 0.3984375, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999953508377075, + "sampling/importance_sampling_ratio/min": 0.003589102067053318, + "sampling/sampling_logp_difference/max": 5.629853248596191, + "sampling/sampling_logp_difference/mean": 0.018400676548480988, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.977112736994968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.977112736994968e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 6590.6796875, + "completions/mean_terminated_length": 6513.56689453125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9243742749094963, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003304310142993927, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 155851000.0, + "reward": 0.3984375, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999579787254333, + "sampling/importance_sampling_ratio/min": 1.2693599273916334e-06, + "sampling/sampling_logp_difference/max": 13.576997756958008, + "sampling/sampling_logp_difference/mean": 0.01959652081131935, + "step": 199 + }, + { + "clip_ratio/high_max": 1.1435367014200892e-05, + "clip_ratio/high_mean": 2.858841753550223e-06, + "clip_ratio/low_mean": 4.7742656533955596e-05, + "clip_ratio/low_min": 8.646529749967158e-06, + "clip_ratio/region_mean": 5.0601498060132144e-05, 
+ "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 6999.484375, + "completions/mean_terminated_length": 6696.7578125, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.843244343996048, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023830258287489414, + "learning_rate": 1e-05, + "loss": 0.1142, + "num_tokens": 156766782.0, + "reward": 0.359375, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998635053634644, + "sampling/importance_sampling_ratio/min": 0.00014761318743694574, + "sampling/sampling_logp_difference/max": 8.820915222167969, + "sampling/sampling_logp_difference/mean": 0.018434934318065643, + "step": 200 + }, + { + "clip_ratio/high_max": 2.5114631171163637e-05, + "clip_ratio/high_mean": 7.040741365926806e-06, + "clip_ratio/low_mean": 5.3607667723554187e-05, + "clip_ratio/low_min": 9.219345429301029e-06, + "clip_ratio/region_mean": 6.064840863473364e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14986.0, + "completions/mean_length": 6407.5, + "completions/mean_terminated_length": 6249.14306640625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.9549195989966393, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024427250027656555, + "learning_rate": 1e-05, + "loss": 0.0795, + "num_tokens": 157606126.0, + "reward": 0.3515625, + "reward_std": 0.32879000902175903, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 0.0002305622911080718, + "sampling/sampling_logp_difference/max": 8.37498950958252, + "sampling/sampling_logp_difference/mean": 0.0192743968218565, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.928529067958152e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.928529067958152e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15519.0, + "completions/mean_length": 6638.390625, + "completions/mean_terminated_length": 5901.328125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.9070822075009346, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002024515997618437, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 158474248.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999830722808838, + "sampling/importance_sampling_ratio/min": 0.0036068728659301996, + "sampling/sampling_logp_difference/max": 5.624914169311523, + "sampling/sampling_logp_difference/mean": 0.01955476775765419, + "step": 202 + }, + { + "clip_ratio/high_max": 8.365173471247545e-06, + "clip_ratio/high_mean": 2.091293367811886e-06, + "clip_ratio/low_mean": 4.1470637825113954e-05, + "clip_ratio/low_min": 4.027710474474588e-06, + "clip_ratio/region_mean": 4.356193130661268e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 7324.546875, + "completions/mean_terminated_length": 6878.99951171875, + "completions/min_length": 571.0, + 
"completions/min_terminated_length": 571.0, + "entropy": 0.9108889549970627, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022787705529481173, + "learning_rate": 1e-05, + "loss": 0.0616, + "num_tokens": 159434350.0, + "reward": 0.3359375, + "reward_std": 0.26515230536460876, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999351501464844, + "sampling/importance_sampling_ratio/min": 0.03948089852929115, + "sampling/sampling_logp_difference/max": 3.231938362121582, + "sampling/sampling_logp_difference/mean": 0.019122496247291565, + "step": 203 + }, + { + "clip_ratio/high_max": 8.65733409227687e-06, + "clip_ratio/high_mean": 2.1643335230692173e-06, + "clip_ratio/low_mean": 3.456336048657249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.672769389595487e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13983.0, + "completions/mean_length": 5520.4453125, + "completions/mean_terminated_length": 5434.9052734375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.8982062339782715, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026195270475000143, + "learning_rate": 1e-05, + "loss": 0.049, + "num_tokens": 160163055.0, + "reward": 0.4375, + "reward_std": 0.24831004440784454, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 0.0005541297141462564, + "sampling/sampling_logp_difference/max": 7.498111724853516, + "sampling/sampling_logp_difference/mean": 0.019064132124185562, + "step": 204 + }, + { + "clip_ratio/high_max": 
1.8376186289970065e-05, + "clip_ratio/high_mean": 6.650576210631698e-06, + "clip_ratio/low_mean": 4.059042771586974e-05, + "clip_ratio/low_min": 5.350111223378917e-06, + "clip_ratio/region_mean": 4.724100449493562e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15267.0, + "completions/max_terminated_length": 15267.0, + "completions/mean_length": 6846.515625, + "completions/mean_terminated_length": 6846.515625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9657742157578468, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0014831912703812122, + "learning_rate": 1e-05, + "loss": 0.006, + "num_tokens": 161057657.0, + "reward": 0.296875, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999252557754517, + "sampling/importance_sampling_ratio/min": 6.252834282349795e-05, + "sampling/sampling_logp_difference/max": 9.679890632629395, + "sampling/sampling_logp_difference/mean": 0.020372584462165833, + "step": 205 + }, + { + "clip_ratio/high_max": 1.658901419432368e-05, + "clip_ratio/high_mean": 4.14725354858092e-06, + "clip_ratio/low_mean": 4.473214539757464e-05, + "clip_ratio/low_min": 2.9674999950657366e-06, + "clip_ratio/region_mean": 4.887939894615556e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16370.0, + "completions/mean_length": 6946.8984375, + "completions/mean_terminated_length": 6642.4755859375, + "completions/min_length": 1133.0, + "completions/min_terminated_length": 1133.0, + "entropy": 0.8490508273243904, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017962189158424735, + "learning_rate": 1e-05, + "loss": 0.0696, + "num_tokens": 161966356.0, + "reward": 0.4296875, + 
"reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 7.035569433355704e-05, + "sampling/sampling_logp_difference/max": 9.561946868896484, + "sampling/sampling_logp_difference/mean": 0.019146796315908432, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.22491199540309e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.22491199540309e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15123.0, + "completions/mean_length": 6618.9765625, + "completions/mean_terminated_length": 6463.9765625, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.9541772454977036, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017619321588426828, + "learning_rate": 1e-05, + "loss": 0.0509, + "num_tokens": 162836705.0, + "reward": 0.390625, + "reward_std": 0.2130674123764038, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999436140060425, + "sampling/importance_sampling_ratio/min": 4.2106199771296815e-07, + "sampling/sampling_logp_difference/max": 14.680485725402832, + "sampling/sampling_logp_difference/mean": 0.020236656069755554, + "step": 207 + }, + { + "clip_ratio/high_max": 1.6846054222696694e-05, + "clip_ratio/high_mean": 4.211513555674173e-06, + "clip_ratio/low_mean": 3.877300162002939e-05, + "clip_ratio/low_min": 4.230834292684449e-06, + "clip_ratio/region_mean": 4.298451551676408e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 12469.0, + "completions/mean_length": 5485.71875, + "completions/mean_terminated_length": 5312.73046875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.8888534903526306, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002670915797352791, + "learning_rate": 1e-05, + "loss": 0.0709, + "num_tokens": 163558197.0, + "reward": 0.46875, + "reward_std": 0.3145885467529297, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000442266464233, + "sampling/importance_sampling_ratio/min": 0.0005042250850237906, + "sampling/sampling_logp_difference/max": 7.592487812042236, + "sampling/sampling_logp_difference/mean": 0.019581373780965805, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6889288480779214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6889288480779214e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16184.0, + "completions/mean_length": 4345.171875, + "completions/mean_terminated_length": 4250.3779296875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.8308270424604416, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004005427472293377, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 164133499.0, + "reward": 0.578125, + "reward_std": 0.31642353534698486, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999247193336487, + "sampling/importance_sampling_ratio/min": 0.022981969639658928, + "sampling/sampling_logp_difference/max": 
3.773045301437378, + "sampling/sampling_logp_difference/mean": 0.017508968710899353, + "step": 209 + }, + { + "clip_ratio/high_max": 1.2997116300539346e-05, + "clip_ratio/high_mean": 3.2492790751348366e-06, + "clip_ratio/low_mean": 2.723402121773688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0483300406558556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5227.296875, + "completions/mean_terminated_length": 5050.20654296875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 0.9231975972652435, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0031033784616738558, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 164823681.0, + "reward": 0.4765625, + "reward_std": 0.29249146580696106, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999896764755249, + "sampling/importance_sampling_ratio/min": 0.0021342060063034296, + "sampling/sampling_logp_difference/max": 6.149660587310791, + "sampling/sampling_logp_difference/mean": 0.019171088933944702, + "step": 210 + }, + { + "clip_ratio/high_max": 2.0835890609305352e-05, + "clip_ratio/high_mean": 5.208972652326338e-06, + "clip_ratio/low_mean": 2.9314877565411734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.452385044511175e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14160.0, + "completions/mean_length": 6473.4765625, + "completions/mean_terminated_length": 6316.1669921875, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 0.9061874598264694, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.003495733719319105, + "learning_rate": 1e-05, + "loss": 0.0785, + "num_tokens": 165668798.0, + "reward": 0.4765625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000354051589966, + "sampling/importance_sampling_ratio/min": 0.0004697878030128777, + "sampling/sampling_logp_difference/max": 7.663229465484619, + "sampling/sampling_logp_difference/mean": 0.018978482112288475, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.991967162164656e-05, + "clip_ratio/low_min": 6.304534053924726e-06, + "clip_ratio/region_mean": 3.991967162164656e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14659.0, + "completions/mean_length": 7140.1953125, + "completions/mean_terminated_length": 6605.4296875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.9605444446206093, + "epoch": 0.19503219871205152, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002381941769272089, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 166603375.0, + "reward": 0.3046875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 0.00043123820796608925, + "sampling/sampling_logp_difference/max": 7.748849868774414, + "sampling/sampling_logp_difference/mean": 0.021141134202480316, + "step": 212 + }, + { + "clip_ratio/high_max": 1.4948576790629886e-05, + "clip_ratio/high_mean": 3.7371441976574715e-06, + "clip_ratio/low_mean": 3.4953729482367635e-05, + "clip_ratio/low_min": 3.991060111729894e-06, + 
"clip_ratio/region_mean": 3.869087413477246e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13770.0, + "completions/mean_length": 5304.46875, + "completions/mean_terminated_length": 5038.56005859375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.9176690131425858, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040566748939454556, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 167302275.0, + "reward": 0.4296875, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999827742576599, + "sampling/importance_sampling_ratio/min": 5.001809313398553e-07, + "sampling/sampling_logp_difference/max": 14.508296012878418, + "sampling/sampling_logp_difference/mean": 0.018822530284523964, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.653866999935417e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.653866999935417e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15791.0, + "completions/mean_length": 5796.5, + "completions/mean_terminated_length": 5542.400390625, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.9230027198791504, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021502040326595306, + "learning_rate": 1e-05, + "loss": 0.0737, + "num_tokens": 168063627.0, + "reward": 0.3828125, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999223351478577, + "sampling/importance_sampling_ratio/min": 0.009504453279078007, + "sampling/sampling_logp_difference/max": 4.655994892120361, + "sampling/sampling_logp_difference/mean": 0.01985779032111168, + "step": 214 + }, + { + "clip_ratio/high_max": 1.0863841453101486e-05, + "clip_ratio/high_mean": 2.7159603632753715e-06, + "clip_ratio/low_mean": 2.4175752741939505e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6891713218901714e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14814.0, + "completions/mean_length": 6135.4921875, + "completions/mean_terminated_length": 6054.79541015625, + "completions/min_length": 1259.0, + "completions/min_terminated_length": 1259.0, + "entropy": 0.869445689022541, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027786416467279196, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 168867858.0, + "reward": 0.4609375, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999550580978394, + "sampling/importance_sampling_ratio/min": 2.6089865059475414e-05, + "sampling/sampling_logp_difference/max": 10.553963661193848, + "sampling/sampling_logp_difference/mean": 0.018514130264520645, + "step": 215 + }, + { + "clip_ratio/high_max": 4.36788013757905e-06, + "clip_ratio/high_mean": 1.0919700343947625e-06, + "clip_ratio/low_mean": 1.993327998661698e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0852980330564606e-06, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15691.0, + "completions/mean_length": 6268.2421875, + "completions/mean_terminated_length": 6025.46435546875, + "completions/min_length": 
627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.951081782579422, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0007328780484385788, + "learning_rate": 1e-05, + "loss": 0.0188, + "num_tokens": 169689969.0, + "reward": 0.3828125, + "reward_std": 0.10994865000247955, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000133514404297, + "sampling/importance_sampling_ratio/min": 1.6650999896228313e-05, + "sampling/sampling_logp_difference/max": 11.003040313720703, + "sampling/sampling_logp_difference/mean": 0.02005261555314064, + "step": 216 + }, + { + "clip_ratio/high_max": 2.131336282218399e-05, + "clip_ratio/high_mean": 5.3283407055459975e-06, + "clip_ratio/low_mean": 3.5254403428552905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.058274430462916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13861.0, + "completions/mean_length": 5440.8984375, + "completions/mean_terminated_length": 5354.732421875, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 0.8271932750940323, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034721922129392624, + "learning_rate": 1e-05, + "loss": -0.0245, + "num_tokens": 170409292.0, + "reward": 0.53125, + "reward_std": 0.30327308177948, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998912811279297, + "sampling/importance_sampling_ratio/min": 1.8372484191786498e-05, + "sampling/sampling_logp_difference/max": 10.904656410217285, + "sampling/sampling_logp_difference/mean": 0.019136395305395126, + "step": 217 + }, + { + 
"clip_ratio/high_max": 1.2339016848272877e-05, + "clip_ratio/high_mean": 4.13687178024702e-06, + "clip_ratio/low_mean": 2.156280152121326e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.569967330146028e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15086.0, + "completions/mean_length": 6671.046875, + "completions/mean_terminated_length": 6594.56689453125, + "completions/min_length": 748.0, + "completions/min_terminated_length": 748.0, + "entropy": 0.9659745842218399, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027575206477195024, + "learning_rate": 1e-05, + "loss": 0.0286, + "num_tokens": 171280714.0, + "reward": 0.375, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411702156067, + "sampling/importance_sampling_ratio/min": 1.5700872609158978e-05, + "sampling/sampling_logp_difference/max": 11.06179428100586, + "sampling/sampling_logp_difference/mean": 0.019089506939053535, + "step": 218 + }, + { + "clip_ratio/high_max": 1.4603458112105727e-05, + "clip_ratio/high_mean": 3.650864528026432e-06, + "clip_ratio/low_mean": 3.2977761520669446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.662862599185246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15752.0, + "completions/mean_length": 7781.5546875, + "completions/mean_terminated_length": 7504.05615234375, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 1.1691131889820099, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012711051385849714, + "learning_rate": 1e-05, + "loss": 0.0115, + "num_tokens": 172302489.0, + "reward": 0.109375, + 
"reward_std": 0.1751839816570282, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998820424079895, + "sampling/importance_sampling_ratio/min": 0.005086081102490425, + "sampling/sampling_logp_difference/max": 5.281247615814209, + "sampling/sampling_logp_difference/mean": 0.023309212177991867, + "step": 219 + }, + { + "clip_ratio/high_max": 6.842087486802484e-06, + "clip_ratio/high_mean": 1.710521871700621e-06, + "clip_ratio/low_mean": 4.5269940528669395e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6980462457213434e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14891.0, + "completions/mean_length": 6489.96875, + "completions/mean_terminated_length": 6332.9208984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9354017227888107, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016933141741901636, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 173149653.0, + "reward": 0.484375, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 0.008998609147965908, + "sampling/sampling_logp_difference/max": 4.7106852531433105, + "sampling/sampling_logp_difference/mean": 0.019165027886629105, + "step": 220 + }, + { + "clip_ratio/high_max": 2.444740721330163e-05, + "clip_ratio/high_mean": 6.111851803325408e-06, + "clip_ratio/low_mean": 3.0998270403870265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.711012095664046e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14943.0, + 
"completions/max_terminated_length": 14943.0, + "completions/mean_length": 6309.75, + "completions/mean_terminated_length": 6309.75, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 1.012483686208725, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024940327275544405, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 173976797.0, + "reward": 0.4375, + "reward_std": 0.2790592610836029, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 0.0018720829393714666, + "sampling/sampling_logp_difference/max": 6.280703544616699, + "sampling/sampling_logp_difference/mean": 0.020797956734895706, + "step": 221 + }, + { + "clip_ratio/high_max": 1.1112337460872368e-05, + "clip_ratio/high_mean": 3.5388877677178243e-06, + "clip_ratio/low_mean": 1.7024583712554886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.056347148027271e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16362.0, + "completions/mean_length": 7574.984375, + "completions/mean_terminated_length": 7363.568359375, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9144782647490501, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002748408354818821, + "learning_rate": 1e-05, + "loss": 0.0588, + "num_tokens": 174965259.0, + "reward": 0.2734375, + "reward_std": 0.25224411487579346, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000108480453491, + "sampling/importance_sampling_ratio/min": 0.005681300535798073, + 
"sampling/sampling_logp_difference/max": 5.170575141906738, + "sampling/sampling_logp_difference/mean": 0.019229793921113014, + "step": 222 + }, + { + "clip_ratio/high_max": 1.4946090004741563e-05, + "clip_ratio/high_mean": 3.736522501185391e-06, + "clip_ratio/low_mean": 3.722507381098694e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.096159636901575e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6962.7734375, + "completions/mean_terminated_length": 6499.43408203125, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9248140156269073, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020343128126114607, + "learning_rate": 1e-05, + "loss": 0.0714, + "num_tokens": 175876446.0, + "reward": 0.421875, + "reward_std": 0.3156445026397705, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 0.0001609467581147328, + "sampling/sampling_logp_difference/max": 8.734436988830566, + "sampling/sampling_logp_difference/mean": 0.01860032044351101, + "step": 223 + }, + { + "clip_ratio/high_max": 4.226114015182247e-06, + "clip_ratio/high_mean": 1.0565285037955618e-06, + "clip_ratio/low_mean": 3.189400638348161e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.295053488727717e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14978.0, + "completions/mean_length": 6422.28125, + "completions/mean_terminated_length": 6264.1591796875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.7786787301301956, + "epoch": 0.20607175712971482, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0029119597747921944, + "learning_rate": 1e-05, + "loss": 0.1116, + "num_tokens": 176717226.0, + "reward": 0.578125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918937683105, + "sampling/importance_sampling_ratio/min": 0.0006287595024332404, + "sampling/sampling_logp_difference/max": 7.371761798858643, + "sampling/sampling_logp_difference/mean": 0.01786171644926071, + "step": 224 + }, + { + "clip_ratio/high_max": 5.4112551879370585e-06, + "clip_ratio/high_mean": 1.3528137969842646e-06, + "clip_ratio/low_mean": 2.103693077515345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2389744572137715e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16030.0, + "completions/mean_length": 6662.65625, + "completions/mean_terminated_length": 6508.349609375, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9501350447535515, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0027519147843122482, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 177586766.0, + "reward": 0.421875, + "reward_std": 0.21382881700992584, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000051259994507, + "sampling/importance_sampling_ratio/min": 2.507045428501442e-05, + "sampling/sampling_logp_difference/max": 10.593820571899414, + "sampling/sampling_logp_difference/mean": 0.020679686218500137, + "step": 225 + }, + { + "clip_ratio/high_max": 3.2487785119883483e-06, + "clip_ratio/high_mean": 8.121946279970871e-07, + "clip_ratio/low_mean": 
5.783435085504607e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8646545539886574e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15419.0, + "completions/mean_length": 6546.171875, + "completions/mean_terminated_length": 6146.259765625, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "entropy": 0.9217342138290405, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017936143558472395, + "learning_rate": 1e-05, + "loss": 0.0748, + "num_tokens": 178444556.0, + "reward": 0.3984375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000327825546265, + "sampling/importance_sampling_ratio/min": 8.447741129202768e-05, + "sampling/sampling_logp_difference/max": 9.379026412963867, + "sampling/sampling_logp_difference/mean": 0.019764548167586327, + "step": 226 + }, + { + "clip_ratio/high_max": 2.1980493102091714e-05, + "clip_ratio/high_mean": 5.4951232755229285e-06, + "clip_ratio/low_mean": 4.3977801396977156e-05, + "clip_ratio/low_min": 7.912247156127705e-06, + "clip_ratio/region_mean": 4.947292427459615e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15707.0, + "completions/max_terminated_length": 15707.0, + "completions/mean_length": 6433.9296875, + "completions/mean_terminated_length": 6433.9296875, + "completions/min_length": 731.0, + "completions/min_terminated_length": 731.0, + "entropy": 0.9361409991979599, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031324021983891726, + "learning_rate": 1e-05, + "loss": 0.0505, + "num_tokens": 179288499.0, + "reward": 0.453125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 
0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999454021453857, + "sampling/importance_sampling_ratio/min": 0.00018488657951820642, + "sampling/sampling_logp_difference/max": 8.595767974853516, + "sampling/sampling_logp_difference/mean": 0.019691072404384613, + "step": 227 + }, + { + "clip_ratio/high_max": 1.299416817346355e-05, + "clip_ratio/high_mean": 3.2485420433658874e-06, + "clip_ratio/low_mean": 3.756406420052372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.081260635757644e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15787.0, + "completions/mean_length": 6037.75, + "completions/mean_terminated_length": 5873.52392578125, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "entropy": 0.8700985535979271, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024714914616197348, + "learning_rate": 1e-05, + "loss": 0.0044, + "num_tokens": 180079619.0, + "reward": 0.484375, + "reward_std": 0.21436560153961182, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999628067016602, + "sampling/importance_sampling_ratio/min": 8.4841696661897e-05, + "sampling/sampling_logp_difference/max": 9.374723434448242, + "sampling/sampling_logp_difference/mean": 0.018519341945648193, + "step": 228 + }, + { + "clip_ratio/high_max": 7.293307589861797e-06, + "clip_ratio/high_mean": 1.8233268974654493e-06, + "clip_ratio/low_mean": 2.2305866423266707e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.412919320704532e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12264.0, + "completions/max_terminated_length": 12264.0, + "completions/mean_length": 5305.828125, + 
"completions/mean_terminated_length": 5305.828125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 1.1309608668088913, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003593914210796356, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 180780877.0, + "reward": 0.3984375, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011920928955, + "sampling/importance_sampling_ratio/min": 0.009941472671926022, + "sampling/sampling_logp_difference/max": 4.611040115356445, + "sampling/sampling_logp_difference/mean": 0.020471621304750443, + "step": 229 + }, + { + "clip_ratio/high_max": 2.0163415001661633e-05, + "clip_ratio/high_mean": 5.040853750415408e-06, + "clip_ratio/low_mean": 4.4980357415624894e-05, + "clip_ratio/low_min": 1.0012816346716136e-05, + "clip_ratio/region_mean": 5.0021211109196884e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13814.0, + "completions/mean_length": 6022.96875, + "completions/mean_terminated_length": 5774.30419921875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8560900762677193, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029816587921231985, + "learning_rate": 1e-05, + "loss": 0.0913, + "num_tokens": 181571465.0, + "reward": 0.515625, + "reward_std": 0.41504397988319397, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 1.5958334188326262e-05, + "sampling/sampling_logp_difference/max": 11.04552936553955, + 
"sampling/sampling_logp_difference/mean": 0.0181986466050148, + "step": 230 + }, + { + "clip_ratio/high_max": 1.8430865566188004e-05, + "clip_ratio/high_mean": 6.177042905619601e-06, + "clip_ratio/low_mean": 4.450247388376738e-05, + "clip_ratio/low_min": 4.840271230932558e-06, + "clip_ratio/region_mean": 5.067951724413433e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15130.0, + "completions/max_terminated_length": 15130.0, + "completions/mean_length": 6647.71875, + "completions/mean_terminated_length": 6647.71875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.9455481320619583, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0031632622703909874, + "learning_rate": 1e-05, + "loss": 0.1317, + "num_tokens": 182440957.0, + "reward": 0.3828125, + "reward_std": 0.39902517199516296, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000306367874146, + "sampling/importance_sampling_ratio/min": 1.4739508515049238e-05, + "sampling/sampling_logp_difference/max": 11.124979019165039, + "sampling/sampling_logp_difference/mean": 0.01906408555805683, + "step": 231 + }, + { + "clip_ratio/high_max": 2.2937053017813014e-05, + "clip_ratio/high_mean": 5.7342632544532535e-06, + "clip_ratio/low_mean": 6.042617155799235e-05, + "clip_ratio/low_min": 1.1000354334100848e-05, + "clip_ratio/region_mean": 6.616043401663774e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15988.0, + "completions/mean_length": 6809.1640625, + "completions/mean_terminated_length": 6500.29833984375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 1.050546184182167, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.00162694591563195, + "learning_rate": 1e-05, + "loss": 0.0346, + "num_tokens": 183332242.0, + "reward": 0.421875, + "reward_std": 0.33616161346435547, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000290870666504, + "sampling/importance_sampling_ratio/min": 4.244970114086755e-06, + "sampling/sampling_logp_difference/max": 12.369775772094727, + "sampling/sampling_logp_difference/mean": 0.021866722032427788, + "step": 232 + }, + { + "clip_ratio/high_max": 1.4678411844215589e-05, + "clip_ratio/high_mean": 3.669602961053897e-06, + "clip_ratio/low_mean": 2.4373607971028832e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8043211159456405e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6815.5, + "completions/mean_terminated_length": 6506.83837890625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 1.060033954679966, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024887355975806713, + "learning_rate": 1e-05, + "loss": 0.1059, + "num_tokens": 184225138.0, + "reward": 0.328125, + "reward_std": 0.2869548499584198, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999393820762634, + "sampling/importance_sampling_ratio/min": 0.00012930770753882825, + "sampling/sampling_logp_difference/max": 8.953315734863281, + "sampling/sampling_logp_difference/mean": 0.02019432932138443, + "step": 233 + }, + { + "clip_ratio/high_max": 7.910891326901037e-06, + "clip_ratio/high_mean": 1.9777228317252593e-06, + "clip_ratio/low_mean": 3.8802519611635944e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.078024221598753e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15838.0, + "completions/mean_length": 6928.4453125, + "completions/mean_terminated_length": 6623.42724609375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "entropy": 0.9051575735211372, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002783838426694274, + "learning_rate": 1e-05, + "loss": 0.0624, + "num_tokens": 185136323.0, + "reward": 0.3359375, + "reward_std": 0.25460803508758545, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999524354934692, + "sampling/importance_sampling_ratio/min": 1.0146355634788051e-05, + "sampling/sampling_logp_difference/max": 11.498395919799805, + "sampling/sampling_logp_difference/mean": 0.01905050128698349, + "step": 234 + }, + { + "clip_ratio/high_max": 4.399394583742833e-06, + "clip_ratio/high_mean": 1.0998486459357082e-06, + "clip_ratio/low_mean": 1.733424267058581e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8434091430208355e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14703.0, + "completions/mean_length": 7155.1328125, + "completions/mean_terminated_length": 7082.46435546875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 1.0119014978408813, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002105508930981159, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 186071324.0, + "reward": 0.328125, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999904990196228, + "sampling/importance_sampling_ratio/min": 0.003494206117466092, + "sampling/sampling_logp_difference/max": 5.656649112701416, + "sampling/sampling_logp_difference/mean": 0.020860780030488968, + "step": 235 + }, + { + "clip_ratio/high_max": 1.0561529961705673e-05, + "clip_ratio/high_mean": 3.4390433256703545e-06, + "clip_ratio/low_mean": 2.8499469067355676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193851205196552e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16176.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 7463.2421875, + "completions/mean_terminated_length": 7463.2421875, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "entropy": 0.9983502700924873, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013582308311015368, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 187045035.0, + "reward": 0.3984375, + "reward_std": 0.2517249584197998, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 0.000473080639494583, + "sampling/sampling_logp_difference/max": 7.65624475479126, + "sampling/sampling_logp_difference/mean": 0.021131811663508415, + "step": 236 + }, + { + "clip_ratio/high_max": 8.509013468938065e-06, + "clip_ratio/high_mean": 2.127253367234516e-06, + "clip_ratio/low_mean": 3.985050443588989e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.197775751890731e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14938.0, + "completions/mean_length": 6460.984375, + "completions/mean_terminated_length": 
6382.8505859375, + "completions/min_length": 1747.0, + "completions/min_terminated_length": 1747.0, + "entropy": 0.7869217246770859, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002681629965081811, + "learning_rate": 1e-05, + "loss": 0.0987, + "num_tokens": 187889609.0, + "reward": 0.5234375, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999568462371826, + "sampling/importance_sampling_ratio/min": 0.0015037209959700704, + "sampling/sampling_logp_difference/max": 6.499812602996826, + "sampling/sampling_logp_difference/mean": 0.016937749460339546, + "step": 237 + }, + { + "clip_ratio/high_max": 1.2362176221358823e-05, + "clip_ratio/high_mean": 3.0905440553397057e-06, + "clip_ratio/low_mean": 5.0333514764133724e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.342405825103924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15893.0, + "completions/mean_length": 6241.78125, + "completions/mean_terminated_length": 6161.92138671875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "entropy": 1.0217387825250626, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021239183843135834, + "learning_rate": 1e-05, + "loss": 0.0353, + "num_tokens": 188706605.0, + "reward": 0.2578125, + "reward_std": 0.3135277330875397, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999796748161316, + "sampling/importance_sampling_ratio/min": 0.004853047896176577, + "sampling/sampling_logp_difference/max": 5.328148365020752, + "sampling/sampling_logp_difference/mean": 
0.02103862166404724, + "step": 238 + }, + { + "clip_ratio/high_max": 6.725130333506968e-06, + "clip_ratio/high_mean": 1.681282583376742e-06, + "clip_ratio/low_mean": 3.437372129155847e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.605500387493521e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15332.0, + "completions/mean_length": 5638.1328125, + "completions/mean_terminated_length": 5553.51953125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.7844365313649178, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023868419229984283, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 189446294.0, + "reward": 0.515625, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000369548797607, + "sampling/importance_sampling_ratio/min": 0.0008047468145377934, + "sampling/sampling_logp_difference/max": 7.124982833862305, + "sampling/sampling_logp_difference/mean": 0.017401430755853653, + "step": 239 + }, + { + "clip_ratio/high_max": 2.887730215661577e-05, + "clip_ratio/high_mean": 7.219325539153942e-06, + "clip_ratio/low_mean": 2.826443028425274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.548375502759882e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16196.0, + "completions/mean_length": 6374.8046875, + "completions/mean_terminated_length": 6215.9287109375, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9472770467400551, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027549315709620714, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 
190281461.0, + "reward": 0.3984375, + "reward_std": 0.3167053163051605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998682737350464, + "sampling/importance_sampling_ratio/min": 7.100860239006579e-05, + "sampling/sampling_logp_difference/max": 9.552709579467773, + "sampling/sampling_logp_difference/mean": 0.020243138074874878, + "step": 240 + }, + { + "clip_ratio/high_max": 1.586787766427733e-05, + "clip_ratio/high_mean": 3.9669694160693325e-06, + "clip_ratio/low_mean": 2.978218674343225e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.374915604581474e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15097.0, + "completions/mean_length": 6654.21875, + "completions/mean_terminated_length": 6499.88134765625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 1.0028243213891983, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013344973558560014, + "learning_rate": 1e-05, + "loss": 0.0184, + "num_tokens": 191156249.0, + "reward": 0.359375, + "reward_std": 0.22832971811294556, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 0.0021875568199902773, + "sampling/sampling_logp_difference/max": 6.124969959259033, + "sampling/sampling_logp_difference/mean": 0.020470600575208664, + "step": 241 + }, + { + "clip_ratio/high_max": 1.681529829511419e-05, + "clip_ratio/high_mean": 4.9954849146160996e-06, + "clip_ratio/low_mean": 2.040554932136729e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5401033553862362e-05, + "completions/clipped_ratio": 0.0234375, 
+ "completions/max_length": 16384.0, + "completions/max_terminated_length": 16172.0, + "completions/mean_length": 6767.7890625, + "completions/mean_terminated_length": 6537.00048828125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.9059296399354935, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016136945923790336, + "learning_rate": 1e-05, + "loss": 0.0816, + "num_tokens": 192040526.0, + "reward": 0.4921875, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999668598175049, + "sampling/importance_sampling_ratio/min": 1.2452921509975567e-05, + "sampling/sampling_logp_difference/max": 11.29355525970459, + "sampling/sampling_logp_difference/mean": 0.020058143883943558, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9821966563758906e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9821966563758906e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16275.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 6767.4921875, + "completions/mean_terminated_length": 6767.4921875, + "completions/min_length": 998.0, + "completions/min_terminated_length": 998.0, + "entropy": 1.0446822568774223, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002869367366656661, + "learning_rate": 1e-05, + "loss": 0.0212, + "num_tokens": 192926469.0, + "reward": 0.3828125, + "reward_std": 0.2517249882221222, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586343765259, + "sampling/importance_sampling_ratio/min": 
1.9328599591972306e-05, + "sampling/sampling_logp_difference/max": 10.853924751281738, + "sampling/sampling_logp_difference/mean": 0.021512050181627274, + "step": 243 + }, + { + "clip_ratio/high_max": 3.44581130775623e-05, + "clip_ratio/high_mean": 1.3001711295146379e-05, + "clip_ratio/low_mean": 3.6407937841431703e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.940964981869911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16261.0, + "completions/max_terminated_length": 16261.0, + "completions/mean_length": 5738.484375, + "completions/mean_terminated_length": 5738.484375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.8617956340312958, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002177527640014887, + "learning_rate": 1e-05, + "loss": -0.0189, + "num_tokens": 193678859.0, + "reward": 0.5546875, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570846557617, + "sampling/importance_sampling_ratio/min": 0.0008533780346624553, + "sampling/sampling_logp_difference/max": 7.06630802154541, + "sampling/sampling_logp_difference/mean": 0.018141131848096848, + "step": 244 + }, + { + "clip_ratio/high_max": 3.861003733618418e-06, + "clip_ratio/high_mean": 9.652509334046044e-07, + "clip_ratio/low_mean": 2.7767115511778684e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8732366558870126e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15595.0, + "completions/mean_length": 6382.90625, + "completions/mean_terminated_length": 5976.357421875, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.8692388981580734, + "epoch": 0.22539098436062557, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.004127771593630314, + "learning_rate": 1e-05, + "loss": 0.0572, + "num_tokens": 194511847.0, + "reward": 0.4140625, + "reward_std": 0.2767002582550049, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 5.4239239943854045e-06, + "sampling/sampling_logp_difference/max": 12.124691009521484, + "sampling/sampling_logp_difference/mean": 0.018376430496573448, + "step": 245 + }, + { + "clip_ratio/high_max": 9.728395525598899e-06, + "clip_ratio/high_mean": 2.4320988813997246e-06, + "clip_ratio/low_mean": 5.3631663831765763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.606376271316549e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14504.0, + "completions/max_terminated_length": 14504.0, + "completions/mean_length": 5776.15625, + "completions/mean_terminated_length": 5776.15625, + "completions/min_length": 1018.0, + "completions/min_terminated_length": 1018.0, + "entropy": 1.1195004731416702, + "epoch": 0.22631094756209752, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00263008801266551, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 195270051.0, + "reward": 0.421875, + "reward_std": 0.3618982434272766, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999971866607666, + "sampling/importance_sampling_ratio/min": 0.005209421273320913, + "sampling/sampling_logp_difference/max": 5.257286548614502, + "sampling/sampling_logp_difference/mean": 0.019923292100429535, + "step": 246 + }, + { + "clip_ratio/high_max": 1.2701100786216557e-05, + "clip_ratio/high_mean": 3.1752751965541393e-06, + "clip_ratio/low_mean": 4.2162768181697174e-05, + 
"clip_ratio/low_min": 3.873926743835909e-06, + "clip_ratio/region_mean": 4.5338043378251314e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 7411.421875, + "completions/mean_terminated_length": 7196.08056640625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.9801053553819656, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002642859937623143, + "learning_rate": 1e-05, + "loss": 0.07, + "num_tokens": 196240913.0, + "reward": 0.390625, + "reward_std": 0.27328529953956604, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999198913574219, + "sampling/importance_sampling_ratio/min": 0.00017500204558018595, + "sampling/sampling_logp_difference/max": 8.650712966918945, + "sampling/sampling_logp_difference/mean": 0.021511007100343704, + "step": 247 + }, + { + "clip_ratio/high_max": 1.5122936929401476e-05, + "clip_ratio/high_mean": 3.780734232350369e-06, + "clip_ratio/low_mean": 6.367217611114029e-05, + "clip_ratio/low_min": 4.8010447244450916e-06, + "clip_ratio/region_mean": 6.745291057086433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16127.0, + "completions/mean_length": 7944.65625, + "completions/mean_terminated_length": 7742.1123046875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 1.0132562816143036, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002439325675368309, + "learning_rate": 1e-05, + "loss": 0.0564, + "num_tokens": 197278517.0, + "reward": 0.34375, + "reward_std": 0.3161812424659729, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 
0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999248385429382, + "sampling/importance_sampling_ratio/min": 1.0140610356756952e-05, + "sampling/sampling_logp_difference/max": 11.49896240234375, + "sampling/sampling_logp_difference/mean": 0.02124868705868721, + "step": 248 + }, + { + "clip_ratio/high_max": 2.6017536356448545e-05, + "clip_ratio/high_mean": 6.504384089112136e-06, + "clip_ratio/low_mean": 3.7791321346958284e-05, + "clip_ratio/low_min": 3.2110563097376144e-06, + "clip_ratio/region_mean": 4.429570503816649e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 7550.0, + "completions/mean_terminated_length": 7409.7783203125, + "completions/min_length": 1469.0, + "completions/min_terminated_length": 1469.0, + "entropy": 1.0384011715650558, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014879995724186301, + "learning_rate": 1e-05, + "loss": 0.0338, + "num_tokens": 198265589.0, + "reward": 0.3359375, + "reward_std": 0.24040167033672333, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999468922615051, + "sampling/importance_sampling_ratio/min": 8.418659126618877e-05, + "sampling/sampling_logp_difference/max": 9.382474899291992, + "sampling/sampling_logp_difference/mean": 0.021503347903490067, + "step": 249 + }, + { + "clip_ratio/high_max": 1.3615457191917812e-05, + "clip_ratio/high_mean": 4.491880531531933e-06, + "clip_ratio/low_mean": 3.916533574965797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.365721684962409e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16221.0, + "completions/mean_length": 8140.9140625, + 
"completions/mean_terminated_length": 7517.48779296875, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.8718572407960892, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002340668346732855, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 199324938.0, + "reward": 0.453125, + "reward_std": 0.35824596881866455, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999454021453857, + "sampling/importance_sampling_ratio/min": 0.002325017238035798, + "sampling/sampling_logp_difference/max": 6.064027786254883, + "sampling/sampling_logp_difference/mean": 0.019466478377580643, + "step": 250 + }, + { + "clip_ratio/high_max": 2.2175697040438536e-05, + "clip_ratio/high_mean": 5.543924260109634e-06, + "clip_ratio/low_mean": 4.1318608055007644e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.686253225827386e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16263.0, + "completions/mean_length": 6630.96875, + "completions/mean_terminated_length": 6396.896484375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.7798146530985832, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001989356242120266, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 200189902.0, + "reward": 0.5625, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474883079529, + "sampling/importance_sampling_ratio/min": 0.0003315774374641478, + "sampling/sampling_logp_difference/max": 8.011649131774902, + 
"sampling/sampling_logp_difference/mean": 0.01849902793765068, + "step": 251 + }, + { + "clip_ratio/high_max": 3.325706302348408e-06, + "clip_ratio/high_mean": 8.31426575587102e-07, + "clip_ratio/low_mean": 2.0285911205064622e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.111733795118198e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15357.0, + "completions/max_terminated_length": 15357.0, + "completions/mean_length": 6582.203125, + "completions/mean_terminated_length": 6582.203125, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 1.0181676000356674, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002594445599243045, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 201052832.0, + "reward": 0.34375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999495148658752, + "sampling/importance_sampling_ratio/min": 0.0003853558446280658, + "sampling/sampling_logp_difference/max": 7.8613433837890625, + "sampling/sampling_logp_difference/mean": 0.021598614752292633, + "step": 252 + }, + { + "clip_ratio/high_max": 2.2044430352252675e-05, + "clip_ratio/high_mean": 5.511107588063169e-06, + "clip_ratio/low_mean": 3.4155824209847196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96669319115972e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14540.0, + "completions/max_terminated_length": 14540.0, + "completions/mean_length": 6145.1796875, + "completions/mean_terminated_length": 6145.1796875, + "completions/min_length": 1098.0, + "completions/min_terminated_length": 1098.0, + "entropy": 0.9084350541234016, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003104996867477894, + "learning_rate": 1e-05, + 
"loss": 0.0811, + "num_tokens": 201858047.0, + "reward": 0.5078125, + "reward_std": 0.33220985531806946, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 0.007650630082935095, + "sampling/sampling_logp_difference/max": 4.87296724319458, + "sampling/sampling_logp_difference/mean": 0.018979094922542572, + "step": 253 + }, + { + "clip_ratio/high_max": 2.9959978519400465e-05, + "clip_ratio/high_mean": 7.489994629850116e-06, + "clip_ratio/low_mean": 3.5255963325653283e-05, + "clip_ratio/low_min": 2.973075879708631e-06, + "clip_ratio/region_mean": 4.274595892184152e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15745.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 7259.953125, + "completions/mean_terminated_length": 7259.953125, + "completions/min_length": 960.0, + "completions/min_terminated_length": 960.0, + "entropy": 0.9823614731431007, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003212577663362026, + "learning_rate": 1e-05, + "loss": 0.0133, + "num_tokens": 202807673.0, + "reward": 0.4765625, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999860405921936, + "sampling/importance_sampling_ratio/min": 0.000536504783667624, + "sampling/sampling_logp_difference/max": 7.530435085296631, + "sampling/sampling_logp_difference/mean": 0.021432969719171524, + "step": 254 + }, + { + "clip_ratio/high_max": 3.273996276220714e-05, + "clip_ratio/high_mean": 9.095591565255745e-06, + "clip_ratio/low_mean": 2.9539680099333054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8635271948805894e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16369.0, + "completions/mean_length": 7258.71875, + "completions/mean_terminated_length": 7113.87353515625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8823810070753098, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001418307889252901, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 203757333.0, + "reward": 0.40625, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884963035583, + "sampling/importance_sampling_ratio/min": 0.0006408974295482039, + "sampling/sampling_logp_difference/max": 7.3526411056518555, + "sampling/sampling_logp_difference/mean": 0.019296500831842422, + "step": 255 + }, + { + "clip_ratio/high_max": 1.544119368190877e-05, + "clip_ratio/high_mean": 3.860298420477193e-06, + "clip_ratio/low_mean": 3.755458698151415e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.141488631148604e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7011.40625, + "completions/mean_terminated_length": 6386.56689453125, + "completions/min_length": 685.0, + "completions/min_terminated_length": 685.0, + "entropy": 0.8057166337966919, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001652427832596004, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 204675065.0, + "reward": 0.46875, + "reward_std": 0.24146251380443573, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918937683105, + 
"sampling/importance_sampling_ratio/min": 0.015319154597818851, + "sampling/sampling_logp_difference/max": 4.178651332855225, + "sampling/sampling_logp_difference/mean": 0.018787402659654617, + "step": 256 + }, + { + "clip_ratio/high_max": 5.222041181696113e-06, + "clip_ratio/high_mean": 2.209917965956265e-06, + "clip_ratio/low_mean": 4.0701652551433654e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.291157006264257e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14796.0, + "completions/max_terminated_length": 14796.0, + "completions/mean_length": 6243.4296875, + "completions/mean_terminated_length": 6243.4296875, + "completions/min_length": 1023.0, + "completions/min_terminated_length": 1023.0, + "entropy": 0.9856048971414566, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001482579973526299, + "learning_rate": 1e-05, + "loss": 0.0677, + "num_tokens": 205494344.0, + "reward": 0.5390625, + "reward_std": 0.28930407762527466, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998942613601685, + "sampling/importance_sampling_ratio/min": 0.0004254466330166906, + "sampling/sampling_logp_difference/max": 7.762371063232422, + "sampling/sampling_logp_difference/mean": 0.019727632403373718, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 6.842733455414418e-05, + "clip_ratio/low_min": 9.297655878981459e-06, + "clip_ratio/region_mean": 6.842733455414418e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15485.0, + "completions/mean_length": 7122.2421875, + "completions/mean_terminated_length": 6586.4375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.8625433370471001, + "epoch": 
0.23735050597976082, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002006452763453126, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 206428775.0, + "reward": 0.40625, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999338388442993, + "sampling/importance_sampling_ratio/min": 0.00010911409481195733, + "sampling/sampling_logp_difference/max": 9.123116493225098, + "sampling/sampling_logp_difference/mean": 0.01927522011101246, + "step": 258 + }, + { + "clip_ratio/high_max": 2.887607206503162e-05, + "clip_ratio/high_mean": 7.219018016257905e-06, + "clip_ratio/low_mean": 2.7790995090981596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.501001378936053e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15254.0, + "completions/mean_length": 7965.2734375, + "completions/mean_terminated_length": 7623.6826171875, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 1.0068430602550507, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0029176415409892797, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 207469586.0, + "reward": 0.3828125, + "reward_std": 0.2212003916501999, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998945593833923, + "sampling/importance_sampling_ratio/min": 4.06005028708023e-06, + "sampling/sampling_logp_difference/max": 12.414315223693848, + "sampling/sampling_logp_difference/mean": 0.02198987640440464, + "step": 259 + }, + { + "clip_ratio/high_max": 8.710998599781306e-06, + "clip_ratio/high_mean": 2.1777496499453264e-06, + 
"clip_ratio/low_mean": 4.1899779091636447e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.407752874158177e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6329.4296875, + "completions/mean_terminated_length": 6169.83349609375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.9399363100528717, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019115234026685357, + "learning_rate": 1e-05, + "loss": 0.0399, + "num_tokens": 208300217.0, + "reward": 0.4375, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000567436218262, + "sampling/importance_sampling_ratio/min": 2.1449603082146496e-05, + "sampling/sampling_logp_difference/max": 10.749804496765137, + "sampling/sampling_logp_difference/mean": 0.020002204924821854, + "step": 260 + }, + { + "clip_ratio/high_max": 2.536784450057894e-05, + "clip_ratio/high_mean": 6.341961125144735e-06, + "clip_ratio/low_mean": 5.959111433639919e-05, + "clip_ratio/low_min": 1.1521060741870315e-05, + "clip_ratio/region_mean": 6.593307591629127e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15635.0, + "completions/mean_length": 6747.90625, + "completions/mean_terminated_length": 6594.95263671875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.9575144425034523, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003766207257285714, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 209181077.0, + "reward": 0.4375, + "reward_std": 0.3164137303829193, + "rewards/accuracy_reward/mean": 0.4375, + 
"rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999313354492188, + "sampling/importance_sampling_ratio/min": 1.250743298442103e-05, + "sampling/sampling_logp_difference/max": 11.28918743133545, + "sampling/sampling_logp_difference/mean": 0.020067427307367325, + "step": 261 + }, + { + "clip_ratio/high_max": 2.0626074274332495e-05, + "clip_ratio/high_mean": 5.156518568583124e-06, + "clip_ratio/low_mean": 5.808068385704246e-05, + "clip_ratio/low_min": 1.0360539818066172e-05, + "clip_ratio/region_mean": 6.32372018571914e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 6426.6953125, + "completions/mean_terminated_length": 6348.29150390625, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.87480478733778, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002375675830990076, + "learning_rate": 1e-05, + "loss": 0.0752, + "num_tokens": 210023702.0, + "reward": 0.5078125, + "reward_std": 0.38900789618492126, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999383687973022, + "sampling/importance_sampling_ratio/min": 0.00024259372730739415, + "sampling/sampling_logp_difference/max": 8.324122428894043, + "sampling/sampling_logp_difference/mean": 0.018864646553993225, + "step": 262 + }, + { + "clip_ratio/high_max": 4.462851393327583e-06, + "clip_ratio/high_mean": 1.1157128483318957e-06, + "clip_ratio/low_mean": 3.8966268334661436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.008198141036701e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + 
"completions/mean_length": 7223.1484375, + "completions/mean_terminated_length": 6927.63671875, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "entropy": 1.0218688547611237, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016257674433290958, + "learning_rate": 1e-05, + "loss": 0.0791, + "num_tokens": 210969921.0, + "reward": 0.4609375, + "reward_std": 0.2896084189414978, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999514818191528, + "sampling/importance_sampling_ratio/min": 9.193710138788447e-05, + "sampling/sampling_logp_difference/max": 9.294405937194824, + "sampling/sampling_logp_difference/mean": 0.02119653858244419, + "step": 263 + }, + { + "clip_ratio/high_max": 1.2653464409595472e-05, + "clip_ratio/high_mean": 3.163366102398868e-06, + "clip_ratio/low_mean": 4.864477250521304e-05, + "clip_ratio/low_min": 8.641252861707471e-06, + "clip_ratio/region_mean": 5.1808138323394815e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15180.0, + "completions/max_terminated_length": 15180.0, + "completions/mean_length": 6974.0703125, + "completions/mean_terminated_length": 6974.0703125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9265539348125458, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023448490537703037, + "learning_rate": 1e-05, + "loss": 0.0567, + "num_tokens": 211884866.0, + "reward": 0.390625, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000474452972412, + "sampling/importance_sampling_ratio/min": 0.0007677432149648666, + 
"sampling/sampling_logp_difference/max": 7.172055244445801, + "sampling/sampling_logp_difference/mean": 0.020384611561894417, + "step": 264 + }, + { + "clip_ratio/high_max": 1.1967917316724197e-05, + "clip_ratio/high_mean": 2.9919793291810493e-06, + "clip_ratio/low_mean": 3.179497366545547e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.478695157355105e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15657.0, + "completions/mean_length": 7247.2734375, + "completions/mean_terminated_length": 7027.9921875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.9756898358464241, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003212807234376669, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 212833933.0, + "reward": 0.328125, + "reward_std": 0.2398776412010193, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999449253082275, + "sampling/importance_sampling_ratio/min": 0.001600456889718771, + "sampling/sampling_logp_difference/max": 6.437466144561768, + "sampling/sampling_logp_difference/mean": 0.0199666079133749, + "step": 265 + }, + { + "clip_ratio/high_max": 1.1404694760130951e-05, + "clip_ratio/high_mean": 3.887520392709121e-06, + "clip_ratio/low_mean": 4.0242122167910566e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4129643583801226e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15575.0, + "completions/mean_length": 7773.9296875, + "completions/mean_terminated_length": 7423.9267578125, + "completions/min_length": 568.0, + "completions/min_terminated_length": 568.0, + "entropy": 0.9765531942248344, + "epoch": 0.24471021159153633, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0019600428640842438, + "learning_rate": 1e-05, + "loss": 0.0357, + "num_tokens": 213848508.0, + "reward": 0.3984375, + "reward_std": 0.3129909336566925, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 2.430168751743622e-05, + "sampling/sampling_logp_difference/max": 10.624964714050293, + "sampling/sampling_logp_difference/mean": 0.020565161481499672, + "step": 266 + }, + { + "clip_ratio/high_max": 6.725708999510971e-06, + "clip_ratio/high_mean": 1.6814272498777427e-06, + "clip_ratio/low_mean": 2.869901106805628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0380438261090603e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15583.0, + "completions/mean_length": 6722.5, + "completions/mean_terminated_length": 6569.14306640625, + "completions/min_length": 1021.0, + "completions/min_terminated_length": 1021.0, + "entropy": 0.9291529878973961, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014550165506079793, + "learning_rate": 1e-05, + "loss": 0.0235, + "num_tokens": 214731180.0, + "reward": 0.4921875, + "reward_std": 0.19332444667816162, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999178647994995, + "sampling/importance_sampling_ratio/min": 0.007400285452604294, + "sampling/sampling_logp_difference/max": 4.90623664855957, + "sampling/sampling_logp_difference/mean": 0.020057080313563347, + "step": 267 + }, + { + "clip_ratio/high_max": 1.8797170469042612e-05, + "clip_ratio/high_mean": 6.827749643889547e-06, + "clip_ratio/low_mean": 
3.448591337473772e-05, + "clip_ratio/low_min": 4.687090040533803e-06, + "clip_ratio/region_mean": 4.1313662677566754e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15797.0, + "completions/max_terminated_length": 15797.0, + "completions/mean_length": 7001.8671875, + "completions/mean_terminated_length": 7001.8671875, + "completions/min_length": 930.0, + "completions/min_terminated_length": 930.0, + "entropy": 1.0746883526444435, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002483292715623975, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 215645819.0, + "reward": 0.3515625, + "reward_std": 0.32955142855644226, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 1.0195622053288389e-05, + "sampling/sampling_logp_difference/max": 11.493552207946777, + "sampling/sampling_logp_difference/mean": 0.020808640867471695, + "step": 268 + }, + { + "clip_ratio/high_max": 8.748068921704544e-06, + "clip_ratio/high_mean": 2.187017230426136e-06, + "clip_ratio/low_mean": 8.762007928453386e-05, + "clip_ratio/low_min": 2.3698836685071e-05, + "clip_ratio/region_mean": 8.980709480965743e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14189.0, + "completions/mean_length": 6663.796875, + "completions/mean_terminated_length": 6509.50830078125, + "completions/min_length": 1148.0, + "completions/min_terminated_length": 1148.0, + "entropy": 1.0000900849699974, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0015696679474785924, + "learning_rate": 1e-05, + "loss": 0.0731, + "num_tokens": 216519369.0, + "reward": 0.3671875, + "reward_std": 0.3214311897754669, + "rewards/accuracy_reward/mean": 0.3671875, + 
"rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997707605361938, + "sampling/importance_sampling_ratio/min": 1.288027192458685e-06, + "sampling/sampling_logp_difference/max": 13.562398910522461, + "sampling/sampling_logp_difference/mean": 0.022182684391736984, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.567897690321843e-05, + "clip_ratio/low_min": 3.287224444648018e-06, + "clip_ratio/region_mean": 4.567897690321843e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16381.0, + "completions/mean_length": 6978.7421875, + "completions/mean_terminated_length": 6829.45263671875, + "completions/min_length": 1661.0, + "completions/min_terminated_length": 1661.0, + "entropy": 1.0845019966363907, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003887100610882044, + "learning_rate": 1e-05, + "loss": 0.1076, + "num_tokens": 217432432.0, + "reward": 0.3671875, + "reward_std": 0.3124619722366333, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999902248382568, + "sampling/importance_sampling_ratio/min": 0.02168075367808342, + "sampling/sampling_logp_difference/max": 3.8313302993774414, + "sampling/sampling_logp_difference/mean": 0.02127157337963581, + "step": 270 + }, + { + "clip_ratio/high_max": 2.444328310957644e-05, + "clip_ratio/high_mean": 6.11082077739411e-06, + "clip_ratio/low_mean": 5.1527222922231886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7638043699625996e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15638.0, + "completions/mean_length": 5903.5546875, + 
"completions/mean_terminated_length": 5652.0244140625, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "entropy": 0.8638224303722382, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002851828932762146, + "learning_rate": 1e-05, + "loss": 0.0771, + "num_tokens": 218208399.0, + "reward": 0.4453125, + "reward_std": 0.3713914752006531, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000553131103516, + "sampling/importance_sampling_ratio/min": 0.000626727007329464, + "sampling/sampling_logp_difference/max": 7.374999523162842, + "sampling/sampling_logp_difference/mean": 0.01880766451358795, + "step": 271 + }, + { + "clip_ratio/high_max": 8.474872856822913e-06, + "clip_ratio/high_mean": 2.118718214205728e-06, + "clip_ratio/low_mean": 2.5821682072546537e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.794040096887329e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16060.0, + "completions/max_terminated_length": 16060.0, + "completions/mean_length": 5596.7109375, + "completions/mean_terminated_length": 5596.7109375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 1.1127397641539574, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018005800666287541, + "learning_rate": 1e-05, + "loss": 0.0075, + "num_tokens": 218944418.0, + "reward": 0.4375, + "reward_std": 0.29485049843788147, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000048875808716, + "sampling/importance_sampling_ratio/min": 0.01548748929053545, + "sampling/sampling_logp_difference/max": 4.167722702026367, + "sampling/sampling_logp_difference/mean": 
0.02004322223365307, + "step": 272 + }, + { + "clip_ratio/high_max": 1.5034628631838132e-05, + "clip_ratio/high_mean": 4.925485768580984e-06, + "clip_ratio/low_mean": 3.539464648838475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.032013237065257e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16362.0, + "completions/mean_length": 7120.109375, + "completions/mean_terminated_length": 7047.16552734375, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "entropy": 1.0697019025683403, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022711476776748896, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 219875952.0, + "reward": 0.2734375, + "reward_std": 0.23751862347126007, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000036358833313, + "sampling/importance_sampling_ratio/min": 9.733050683280453e-05, + "sampling/sampling_logp_difference/max": 9.237398147583008, + "sampling/sampling_logp_difference/mean": 0.02110595628619194, + "step": 273 + }, + { + "clip_ratio/high_max": 1.0558468147792155e-05, + "clip_ratio/high_mean": 2.6396170369480387e-06, + "clip_ratio/low_mean": 3.796903268948881e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.060864915800266e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 7623.953125, + "completions/mean_terminated_length": 7484.9052734375, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "entropy": 0.8836525157094002, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002609838731586933, + "learning_rate": 1e-05, + "loss": 0.0563, + 
"num_tokens": 220871730.0, + "reward": 0.3046875, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999877214431763, + "sampling/importance_sampling_ratio/min": 0.0015448236372321844, + "sampling/sampling_logp_difference/max": 6.472845554351807, + "sampling/sampling_logp_difference/mean": 0.019322458654642105, + "step": 274 + }, + { + "clip_ratio/high_max": 1.144785210271948e-05, + "clip_ratio/high_mean": 2.86196302567987e-06, + "clip_ratio/low_mean": 5.795533934360719e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 6.081730361984228e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15557.0, + "completions/mean_length": 6778.71875, + "completions/mean_terminated_length": 6703.08642578125, + "completions/min_length": 1187.0, + "completions/min_terminated_length": 1187.0, + "entropy": 0.8968989998102188, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.00395589042454958, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 221761214.0, + "reward": 0.4921875, + "reward_std": 0.4032142758369446, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000214576721191, + "sampling/importance_sampling_ratio/min": 0.0011724763317033648, + "sampling/sampling_logp_difference/max": 6.7486371994018555, + "sampling/sampling_logp_difference/mean": 0.018937086686491966, + "step": 275 + }, + { + "clip_ratio/high_max": 2.708495139813749e-05, + "clip_ratio/high_mean": 7.628764933542698e-06, + "clip_ratio/low_mean": 3.0297362627607072e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.792612744746293e-05, + 
"completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 7319.2578125, + "completions/mean_terminated_length": 6794.85107421875, + "completions/min_length": 1034.0, + "completions/min_terminated_length": 1034.0, + "entropy": 0.870811752974987, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002063714899122715, + "learning_rate": 1e-05, + "loss": 0.0271, + "num_tokens": 222719287.0, + "reward": 0.3203125, + "reward_std": 0.2835301160812378, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999525547027588, + "sampling/importance_sampling_ratio/min": 2.13631665246794e-05, + "sampling/sampling_logp_difference/max": 10.7538423538208, + "sampling/sampling_logp_difference/mean": 0.019336167722940445, + "step": 276 + }, + { + "clip_ratio/high_max": 3.860288416035473e-06, + "clip_ratio/high_mean": 9.650721040088683e-07, + "clip_ratio/low_mean": 2.303871349340625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4003785597415117e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16326.0, + "completions/mean_length": 6207.4140625, + "completions/mean_terminated_length": 5879.13671875, + "completions/min_length": 752.0, + "completions/min_terminated_length": 752.0, + "entropy": 0.8348869979381561, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023463829420506954, + "learning_rate": 1e-05, + "loss": 0.0696, + "num_tokens": 223533372.0, + "reward": 0.4375, + "reward_std": 0.2359210103750229, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
1.0000433921813965, + "sampling/importance_sampling_ratio/min": 2.1447433027788065e-05, + "sampling/sampling_logp_difference/max": 10.749905586242676, + "sampling/sampling_logp_difference/mean": 0.018392907455563545, + "step": 277 + }, + { + "clip_ratio/high_max": 2.1441665467136772e-05, + "clip_ratio/high_mean": 5.360416366784193e-06, + "clip_ratio/low_mean": 5.504566888703266e-05, + "clip_ratio/low_min": 1.2581466762640048e-05, + "clip_ratio/region_mean": 6.040608514013002e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14713.0, + "completions/max_terminated_length": 14713.0, + "completions/mean_length": 6417.2109375, + "completions/mean_terminated_length": 6417.2109375, + "completions/min_length": 981.0, + "completions/min_terminated_length": 981.0, + "entropy": 1.0232173576951027, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033652919810265303, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 224375711.0, + "reward": 0.390625, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999820590019226, + "sampling/importance_sampling_ratio/min": 0.0020559614058583975, + "sampling/sampling_logp_difference/max": 6.18701171875, + "sampling/sampling_logp_difference/mean": 0.020980924367904663, + "step": 278 + }, + { + "clip_ratio/high_max": 4.679544872487895e-06, + "clip_ratio/high_mean": 1.1698862181219738e-06, + "clip_ratio/low_mean": 2.818696702888701e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9356853247008985e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15026.0, + "completions/max_terminated_length": 15026.0, + "completions/mean_length": 5275.9453125, + "completions/mean_terminated_length": 5275.9453125, + "completions/min_length": 473.0, + "completions/min_terminated_length": 
473.0, + "entropy": 0.8563915193080902, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025473968125879765, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 225070992.0, + "reward": 0.703125, + "reward_std": 0.2790592610836029, + "rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999873042106628, + "sampling/importance_sampling_ratio/min": 0.0010016229934990406, + "sampling/sampling_logp_difference/max": 6.906133651733398, + "sampling/sampling_logp_difference/mean": 0.018068701028823853, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.1973035422415705e-05, + "clip_ratio/low_min": 6.267234766710317e-06, + "clip_ratio/region_mean": 4.1973035422415705e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 7693.984375, + "completions/mean_terminated_length": 7556.0478515625, + "completions/min_length": 1349.0, + "completions/min_terminated_length": 1349.0, + "entropy": 0.7832933664321899, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016663498245179653, + "learning_rate": 1e-05, + "loss": 0.0836, + "num_tokens": 226073822.0, + "reward": 0.421875, + "reward_std": 0.3227166533470154, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999273419380188, + "sampling/importance_sampling_ratio/min": 5.893720299354754e-06, + "sampling/sampling_logp_difference/max": 12.04162311553955, + "sampling/sampling_logp_difference/mean": 0.01851016655564308, + "step": 280 + }, + { + "clip_ratio/high_max": 1.304801662627142e-05, + "clip_ratio/high_mean": 
3.262004156567855e-06, + "clip_ratio/low_mean": 3.7096169648975774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.035817426029098e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15207.0, + "completions/mean_length": 6222.4609375, + "completions/mean_terminated_length": 6061.1669921875, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "entropy": 0.8835120126605034, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021045261528342962, + "learning_rate": 1e-05, + "loss": 0.055, + "num_tokens": 226888577.0, + "reward": 0.5078125, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616742134094, + "sampling/importance_sampling_ratio/min": 5.688065698450373e-07, + "sampling/sampling_logp_difference/max": 14.379725456237793, + "sampling/sampling_logp_difference/mean": 0.018851105123758316, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.1754828114571865e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1754828114571865e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16331.0, + "completions/mean_length": 6371.625, + "completions/mean_terminated_length": 6131.328125, + "completions/min_length": 1034.0, + "completions/min_terminated_length": 1034.0, + "entropy": 0.9026313945651054, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030448357574641705, + "learning_rate": 1e-05, + "loss": 0.1009, + "num_tokens": 227722025.0, + "reward": 0.515625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 
0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999712705612183, + "sampling/importance_sampling_ratio/min": 0.00016869648243300617, + "sampling/sampling_logp_difference/max": 8.687409400939941, + "sampling/sampling_logp_difference/mean": 0.018757576122879982, + "step": 282 + }, + { + "clip_ratio/high_max": 7.024085562079563e-06, + "clip_ratio/high_mean": 1.7560213905198907e-06, + "clip_ratio/low_mean": 3.379111592494155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5547137599678535e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15891.0, + "completions/mean_length": 7510.4921875, + "completions/mean_terminated_length": 7224.25, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 1.044313833117485, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019295766251161695, + "learning_rate": 1e-05, + "loss": 0.0513, + "num_tokens": 228703256.0, + "reward": 0.3046875, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999972581863403, + "sampling/importance_sampling_ratio/min": 0.0002186153142247349, + "sampling/sampling_logp_difference/max": 8.428196907043457, + "sampling/sampling_logp_difference/mean": 0.02207346074283123, + "step": 283 + }, + { + "clip_ratio/high_max": 5.068321115686558e-06, + "clip_ratio/high_mean": 1.2670802789216395e-06, + "clip_ratio/low_mean": 3.7797102550030104e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9064182828951743e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 7594.140625, + 
"completions/mean_terminated_length": 7524.92919921875, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9706612005829811, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0017117204843088984, + "learning_rate": 1e-05, + "loss": 0.0748, + "num_tokens": 229697002.0, + "reward": 0.2734375, + "reward_std": 0.18649455904960632, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016212463379, + "sampling/importance_sampling_ratio/min": 0.00035400164779275656, + "sampling/sampling_logp_difference/max": 7.946208953857422, + "sampling/sampling_logp_difference/mean": 0.021097885444760323, + "step": 284 + }, + { + "clip_ratio/high_max": 1.5618601537426002e-05, + "clip_ratio/high_mean": 3.904650384356501e-06, + "clip_ratio/low_mean": 4.570582996166195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.961048034601845e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15562.0, + "completions/mean_length": 6888.9140625, + "completions/mean_terminated_length": 6738.19873046875, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "entropy": 0.9210037142038345, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025933689903467894, + "learning_rate": 1e-05, + "loss": 0.0887, + "num_tokens": 230598679.0, + "reward": 0.4375, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.0007308972999453545, + "sampling/sampling_logp_difference/max": 7.221237659454346, + 
"sampling/sampling_logp_difference/mean": 0.01939917542040348, + "step": 285 + }, + { + "clip_ratio/high_max": 2.398964193162101e-05, + "clip_ratio/high_mean": 6.9283565835576155e-06, + "clip_ratio/low_mean": 4.821338916372042e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.514174608833855e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15458.0, + "completions/mean_length": 6433.640625, + "completions/mean_terminated_length": 6355.29150390625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 1.064419962465763, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0019397985888645053, + "learning_rate": 1e-05, + "loss": 0.0841, + "num_tokens": 231440153.0, + "reward": 0.375, + "reward_std": 0.3451131582260132, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999503493309021, + "sampling/importance_sampling_ratio/min": 0.019039930775761604, + "sampling/sampling_logp_difference/max": 3.961216926574707, + "sampling/sampling_logp_difference/mean": 0.021084938198328018, + "step": 286 + }, + { + "clip_ratio/high_max": 1.9223051822336856e-05, + "clip_ratio/high_mean": 6.997284344834043e-06, + "clip_ratio/low_mean": 5.4512621773028513e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.150990611786256e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14304.0, + "completions/mean_length": 5858.40625, + "completions/mean_terminated_length": 5691.33349609375, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "entropy": 0.8120778575539589, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002288782736286521, + "learning_rate": 
1e-05, + "loss": 0.0408, + "num_tokens": 232209485.0, + "reward": 0.46875, + "reward_std": 0.36637401580810547, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999239444732666, + "sampling/importance_sampling_ratio/min": 0.00017959839897230268, + "sampling/sampling_logp_difference/max": 8.624787330627441, + "sampling/sampling_logp_difference/mean": 0.019076552242040634, + "step": 287 + }, + { + "clip_ratio/high_max": 9.900939403451048e-06, + "clip_ratio/high_mean": 3.4680233511608094e-06, + "clip_ratio/low_mean": 1.8137742017643177e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1605765368803986e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7088.4765625, + "completions/mean_terminated_length": 6710.609375, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "entropy": 0.9231890514492989, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.001075367210432887, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 233133850.0, + "reward": 0.5078125, + "reward_std": 0.18383610248565674, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998681545257568, + "sampling/importance_sampling_ratio/min": 0.005257915705442429, + "sampling/sampling_logp_difference/max": 5.248020648956299, + "sampling/sampling_logp_difference/mean": 0.019140273332595825, + "step": 288 + }, + { + "clip_ratio/high_max": 8.648456969240215e-06, + "clip_ratio/high_mean": 2.1621142423100537e-06, + "clip_ratio/low_mean": 1.838804723774956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0550161480059614e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16384.0, + "completions/mean_length": 6151.78125, + "completions/mean_terminated_length": 5906.20849609375, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.8585417941212654, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0015517349820584059, + "learning_rate": 1e-05, + "loss": 0.0828, + "num_tokens": 233940718.0, + "reward": 0.46875, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000255107879639, + "sampling/importance_sampling_ratio/min": 7.617311348440126e-05, + "sampling/sampling_logp_difference/max": 9.482501983642578, + "sampling/sampling_logp_difference/mean": 0.019276250153779984, + "step": 289 + }, + { + "clip_ratio/high_max": 1.1416668485253467e-05, + "clip_ratio/high_mean": 3.7661499732166703e-06, + "clip_ratio/low_mean": 2.1342358195397537e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5108507770710276e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15133.0, + "completions/mean_length": 7111.2578125, + "completions/mean_terminated_length": 6812.13671875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.9735362678766251, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0036829947493970394, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 234872111.0, + "reward": 0.4296875, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999943971633911, + "sampling/importance_sampling_ratio/min": 0.0006535807042382658, + "sampling/sampling_logp_difference/max": 7.333044528961182, + "sampling/sampling_logp_difference/mean": 0.021356046199798584, + "step": 290 + }, + { + "clip_ratio/high_max": 2.2526005068357335e-05, + "clip_ratio/high_mean": 5.631501267089334e-06, + "clip_ratio/low_mean": 3.30086276107977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.864012808207917e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15995.0, + "completions/mean_length": 6787.671875, + "completions/mean_terminated_length": 6478.11279296875, + "completions/min_length": 1404.0, + "completions/min_terminated_length": 1404.0, + "entropy": 0.8856986835598946, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00234629912301898, + "learning_rate": 1e-05, + "loss": 0.0169, + "num_tokens": 235759149.0, + "reward": 0.5390625, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999296069145203, + "sampling/importance_sampling_ratio/min": 0.00035710117663256824, + "sampling/sampling_logp_difference/max": 7.937491416931152, + "sampling/sampling_logp_difference/mean": 0.01950475014746189, + "step": 291 + }, + { + "clip_ratio/high_max": 2.6025282068076194e-05, + "clip_ratio/high_mean": 6.5063205170190486e-06, + "clip_ratio/low_mean": 4.603358706845029e-05, + "clip_ratio/low_min": 4.53654638477019e-06, + "clip_ratio/region_mean": 5.253990843812062e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15845.0, + "completions/mean_length": 6757.203125, + "completions/mean_terminated_length": 6604.39697265625, + "completions/min_length": 236.0, + 
"completions/min_terminated_length": 236.0, + "entropy": 0.9217840805649757, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034125701058655977, + "learning_rate": 1e-05, + "loss": 0.0527, + "num_tokens": 236643319.0, + "reward": 0.3515625, + "reward_std": 0.2896084189414978, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 6.144329745438881e-06, + "sampling/sampling_logp_difference/max": 11.999980926513672, + "sampling/sampling_logp_difference/mean": 0.020774487406015396, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5210429246035346e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5210429246035346e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 6504.4375, + "completions/mean_terminated_length": 6185.74169921875, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "entropy": 1.126970261335373, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020905097480863333, + "learning_rate": 1e-05, + "loss": 0.0464, + "num_tokens": 237495351.0, + "reward": 0.25, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000053644180298, + "sampling/importance_sampling_ratio/min": 0.0009940610034391284, + "sampling/sampling_logp_difference/max": 6.913712024688721, + "sampling/sampling_logp_difference/mean": 0.023218728601932526, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 1.5693222053414502e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5693222053414502e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15888.0, + "completions/mean_length": 5702.4140625, + "completions/mean_terminated_length": 5446.05615234375, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8803137242794037, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002073790645226836, + "learning_rate": 1e-05, + "loss": 0.0066, + "num_tokens": 238251852.0, + "reward": 0.5625, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000054955482483, + "sampling/importance_sampling_ratio/min": 0.016290459781885147, + "sampling/sampling_logp_difference/max": 4.117175579071045, + "sampling/sampling_logp_difference/mean": 0.0185186006128788, + "step": 294 + }, + { + "clip_ratio/high_max": 1.4213665508577833e-05, + "clip_ratio/high_mean": 4.4483959982244414e-06, + "clip_ratio/low_mean": 2.979715202400257e-05, + "clip_ratio/low_min": 4.1597336348786484e-06, + "clip_ratio/region_mean": 3.424554824960069e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 7176.2890625, + "completions/mean_terminated_length": 6801.99169921875, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.9554997384548187, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002781527815386653, + "learning_rate": 1e-05, + "loss": 0.0908, + "num_tokens": 239189385.0, + "reward": 0.5078125, + "reward_std": 0.3634958863258362, + "rewards/accuracy_reward/mean": 0.5078125, + 
"rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999794960021973, + "sampling/importance_sampling_ratio/min": 0.0018711343873292208, + "sampling/sampling_logp_difference/max": 6.281210422515869, + "sampling/sampling_logp_difference/mean": 0.020436719059944153, + "step": 295 + }, + { + "clip_ratio/high_max": 1.2612186310434481e-05, + "clip_ratio/high_mean": 5.171368570699997e-06, + "clip_ratio/low_mean": 4.8968343890010146e-05, + "clip_ratio/low_min": 4.0222671486844774e-06, + "clip_ratio/region_mean": 5.413971166490228e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16020.0, + "completions/mean_length": 7855.578125, + "completions/mean_terminated_length": 7651.2001953125, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "entropy": 0.9450526610016823, + "epoch": 0.27230910763569455, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003132987068966031, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 240217715.0, + "reward": 0.40625, + "reward_std": 0.28512775897979736, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999253153800964, + "sampling/importance_sampling_ratio/min": 0.0011438478250056505, + "sampling/sampling_logp_difference/max": 6.773357391357422, + "sampling/sampling_logp_difference/mean": 0.021461743861436844, + "step": 296 + }, + { + "clip_ratio/high_max": 2.172341964978841e-05, + "clip_ratio/high_mean": 6.823271291978017e-06, + "clip_ratio/low_mean": 3.516899266742257e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.199226441414794e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14060.0, + 
"completions/mean_length": 6240.265625, + "completions/mean_terminated_length": 5913.04833984375, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.8811023011803627, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028523094952106476, + "learning_rate": 1e-05, + "loss": 0.015, + "num_tokens": 241035133.0, + "reward": 0.484375, + "reward_std": 0.26143303513526917, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000007152557373, + "sampling/importance_sampling_ratio/min": 0.0006931954412721097, + "sampling/sampling_logp_difference/max": 7.274198532104492, + "sampling/sampling_logp_difference/mean": 0.019493088126182556, + "step": 297 + }, + { + "clip_ratio/high_max": 1.2606601558218244e-05, + "clip_ratio/high_mean": 3.151650389554561e-06, + "clip_ratio/low_mean": 3.768150395444536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.08331545713736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15795.0, + "completions/mean_length": 6103.203125, + "completions/mean_terminated_length": 6022.251953125, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.8766692876815796, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026241440791636705, + "learning_rate": 1e-05, + "loss": 0.0089, + "num_tokens": 241836479.0, + "reward": 0.453125, + "reward_std": 0.32589423656463623, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925434589386, + "sampling/importance_sampling_ratio/min": 0.00012664205860346556, + "sampling/sampling_logp_difference/max": 
8.974145889282227, + "sampling/sampling_logp_difference/mean": 0.01907728984951973, + "step": 298 + }, + { + "clip_ratio/high_max": 1.7400974911652156e-05, + "clip_ratio/high_mean": 4.350243727913039e-06, + "clip_ratio/low_mean": 4.527119426711579e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962143839293276e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 7711.0703125, + "completions/mean_terminated_length": 7573.4052734375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 1.0770929008722305, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003654222236946225, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 242844376.0, + "reward": 0.3359375, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999839067459106, + "sampling/importance_sampling_ratio/min": 0.0006267472635954618, + "sampling/sampling_logp_difference/max": 7.374967098236084, + "sampling/sampling_logp_difference/mean": 0.022012868896126747, + "step": 299 + }, + { + "clip_ratio/high_max": 1.4325163647299632e-05, + "clip_ratio/high_mean": 3.581290911824908e-06, + "clip_ratio/low_mean": 4.28195745598714e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6400865016948956e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15905.0, + "completions/mean_length": 6616.5546875, + "completions/mean_terminated_length": 6539.6455078125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.8439916148781776, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.0029195898678153753, + "learning_rate": 1e-05, + "loss": 0.1094, + "num_tokens": 243708479.0, + "reward": 0.453125, + "reward_std": 0.3516485095024109, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 2.189194128732197e-05, + "sampling/sampling_logp_difference/max": 10.729392051696777, + "sampling/sampling_logp_difference/mean": 0.017992788925766945, + "step": 300 + }, + { + "clip_ratio/high_max": 1.848296233220026e-05, + "clip_ratio/high_mean": 4.620740583050065e-06, + "clip_ratio/low_mean": 5.01860952226707e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.480683557834709e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15286.0, + "completions/mean_length": 6173.5234375, + "completions/mean_terminated_length": 6093.1259765625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.8975192531943321, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0017261393368244171, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 244515378.0, + "reward": 0.53125, + "reward_std": 0.3532412052154541, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999533891677856, + "sampling/importance_sampling_ratio/min": 0.000553854217287153, + "sampling/sampling_logp_difference/max": 7.4986090660095215, + "sampling/sampling_logp_difference/mean": 0.019458644092082977, + "step": 301 + }, + { + "clip_ratio/high_max": 4.114005332667148e-05, + "clip_ratio/high_mean": 1.2276760230633954e-05, + "clip_ratio/low_mean": 3.397437080820964e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.625113024303573e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16032.0, + "completions/mean_length": 5640.90625, + "completions/mean_terminated_length": 5470.38134765625, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "entropy": 0.8833519890904427, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018768958980217576, + "learning_rate": 1e-05, + "loss": 0.0731, + "num_tokens": 245258318.0, + "reward": 0.4609375, + "reward_std": 0.3135277330875397, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999516606330872, + "sampling/importance_sampling_ratio/min": 0.0071789538487792015, + "sampling/sampling_logp_difference/max": 4.936601638793945, + "sampling/sampling_logp_difference/mean": 0.019646335393190384, + "step": 302 + }, + { + "clip_ratio/high_max": 1.4196921938491869e-05, + "clip_ratio/high_mean": 4.514302474944998e-06, + "clip_ratio/low_mean": 4.4677519781544106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.919182129015098e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16378.0, + "completions/mean_length": 7840.5078125, + "completions/mean_terminated_length": 7564.9111328125, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "entropy": 0.9772802665829659, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002617602702230215, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 246280663.0, + "reward": 0.328125, + "reward_std": 0.29826050996780396, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, 
+ "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0008982301224023104, + "sampling/sampling_logp_difference/max": 7.015084266662598, + "sampling/sampling_logp_difference/mean": 0.022171074524521828, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7621316146687604e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7621316146687604e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16326.0, + "completions/mean_length": 6316.1015625, + "completions/mean_terminated_length": 6074.47216796875, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "entropy": 0.8542795851826668, + "epoch": 0.2796688132474701, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0011874900665134192, + "learning_rate": 1e-05, + "loss": 0.0513, + "num_tokens": 247107604.0, + "reward": 0.3828125, + "reward_std": 0.2227931022644043, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000126361846924, + "sampling/importance_sampling_ratio/min": 0.00015846268797758967, + "sampling/sampling_logp_difference/max": 8.749991416931152, + "sampling/sampling_logp_difference/mean": 0.018691308796405792, + "step": 304 + }, + { + "clip_ratio/high_max": 3.0959752166381804e-06, + "clip_ratio/high_mean": 7.739938041595451e-07, + "clip_ratio/low_mean": 6.0967123090449604e-05, + "clip_ratio/low_min": 2.711407751121442e-05, + "clip_ratio/region_mean": 6.17411176335736e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6568.171875, + "completions/mean_terminated_length": 6412.365234375, + "completions/min_length": 97.0, + 
"completions/min_terminated_length": 97.0, + "entropy": 0.9063890501856804, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002459619427099824, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 247967322.0, + "reward": 0.5, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998743534088135, + "sampling/importance_sampling_ratio/min": 0.012350871227681637, + "sampling/sampling_logp_difference/max": 4.394028663635254, + "sampling/sampling_logp_difference/mean": 0.020134467631578445, + "step": 305 + }, + { + "clip_ratio/high_max": 5.9507838159333915e-06, + "clip_ratio/high_mean": 1.4876959539833479e-06, + "clip_ratio/low_mean": 2.400908408617397e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.549678004015732e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15714.0, + "completions/mean_length": 8182.28125, + "completions/mean_terminated_length": 7635.50048828125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 1.0137704983353615, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016673406353220344, + "learning_rate": 1e-05, + "loss": 0.0244, + "num_tokens": 249031710.0, + "reward": 0.3359375, + "reward_std": 0.22225631773471832, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998995065689087, + "sampling/importance_sampling_ratio/min": 0.0008049134048633277, + "sampling/sampling_logp_difference/max": 7.1247758865356445, + "sampling/sampling_logp_difference/mean": 0.021704845130443573, + "step": 306 + }, + { + "clip_ratio/high_max": 
1.4527202438330278e-05, + "clip_ratio/high_mean": 3.6318006095825695e-06, + "clip_ratio/low_mean": 3.1829216595724574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5461017205307144e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14502.0, + "completions/max_terminated_length": 14502.0, + "completions/mean_length": 6460.5703125, + "completions/mean_terminated_length": 6460.5703125, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "entropy": 1.0418165400624275, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022682021372020245, + "learning_rate": 1e-05, + "loss": 0.0171, + "num_tokens": 249881047.0, + "reward": 0.359375, + "reward_std": 0.25566887855529785, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999744296073914, + "sampling/importance_sampling_ratio/min": 0.002809183904901147, + "sampling/sampling_logp_difference/max": 5.874861240386963, + "sampling/sampling_logp_difference/mean": 0.02204791083931923, + "step": 307 + }, + { + "clip_ratio/high_max": 9.222687367582694e-06, + "clip_ratio/high_mean": 4.125313353142701e-06, + "clip_ratio/low_mean": 4.836107154915226e-05, + "clip_ratio/low_min": 3.4611657611094415e-06, + "clip_ratio/region_mean": 5.248638444754761e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14754.0, + "completions/mean_length": 6846.3046875, + "completions/mean_terminated_length": 6694.9130859375, + "completions/min_length": 944.0, + "completions/min_terminated_length": 944.0, + "entropy": 0.9839218333363533, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002436346374452114, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 250773806.0, + "reward": 0.484375, + "reward_std": 
0.34299150109291077, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 0.0257408544421196, + "sampling/sampling_logp_difference/max": 3.6596758365631104, + "sampling/sampling_logp_difference/mean": 0.02135510742664337, + "step": 308 + }, + { + "clip_ratio/high_max": 1.3327621218195418e-05, + "clip_ratio/high_mean": 3.3319053045488545e-06, + "clip_ratio/low_mean": 3.791964286392613e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1251548054788145e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15777.0, + "completions/mean_length": 6558.53125, + "completions/mean_terminated_length": 6241.58056640625, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "entropy": 0.7833076938986778, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002015948062762618, + "learning_rate": 1e-05, + "loss": 0.0791, + "num_tokens": 251633074.0, + "reward": 0.46875, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999434947967529, + "sampling/importance_sampling_ratio/min": 5.1445105782477185e-05, + "sampling/sampling_logp_difference/max": 9.874995231628418, + "sampling/sampling_logp_difference/mean": 0.017078280448913574, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.3865982686620555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3865982686620555e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16162.0, + 
"completions/mean_length": 7626.390625, + "completions/mean_terminated_length": 7487.38134765625, + "completions/min_length": 1400.0, + "completions/min_terminated_length": 1400.0, + "entropy": 0.8946382254362106, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001098336186259985, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 252629300.0, + "reward": 0.3359375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000107288360596, + "sampling/importance_sampling_ratio/min": 0.00021643216314259917, + "sampling/sampling_logp_difference/max": 8.438233375549316, + "sampling/sampling_logp_difference/mean": 0.01972624473273754, + "step": 310 + }, + { + "clip_ratio/high_max": 6.5777783220255515e-06, + "clip_ratio/high_mean": 1.6444445805063879e-06, + "clip_ratio/low_mean": 1.7658890669736138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9303335250242526e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15763.0, + "completions/mean_length": 5796.984375, + "completions/mean_terminated_length": 5713.6220703125, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "entropy": 0.969724528491497, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003871417138725519, + "learning_rate": 1e-05, + "loss": 0.0408, + "num_tokens": 253389562.0, + "reward": 0.484375, + "reward_std": 0.23752351105213165, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998880624771118, + "sampling/importance_sampling_ratio/min": 2.4301782104885206e-05, + 
"sampling/sampling_logp_difference/max": 10.624960899353027, + "sampling/sampling_logp_difference/mean": 0.019220752641558647, + "step": 311 + }, + { + "clip_ratio/high_max": 8.099077376755304e-06, + "clip_ratio/high_mean": 2.8300572125772305e-06, + "clip_ratio/low_mean": 3.2033483023496956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.486354006554393e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15769.0, + "completions/mean_length": 6938.5625, + "completions/mean_terminated_length": 6788.63525390625, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.9812447279691696, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002257548039779067, + "learning_rate": 1e-05, + "loss": -0.0089, + "num_tokens": 254295858.0, + "reward": 0.4140625, + "reward_std": 0.2596206068992615, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000464916229248, + "sampling/importance_sampling_ratio/min": 0.0009388317703269422, + "sampling/sampling_logp_difference/max": 6.970874309539795, + "sampling/sampling_logp_difference/mean": 0.02080199122428894, + "step": 312 + }, + { + "clip_ratio/high_max": 4.441917553776875e-06, + "clip_ratio/high_mean": 1.1104793884442188e-06, + "clip_ratio/low_mean": 3.414505465570983e-05, + "clip_ratio/low_min": 3.790060873143375e-06, + "clip_ratio/region_mean": 3.5255534044154047e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15163.0, + "completions/mean_length": 6878.15625, + "completions/mean_terminated_length": 6650.01611328125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.9106859937310219, + "epoch": 0.28794848206071755, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.00420041661709547, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 255197110.0, + "reward": 0.421875, + "reward_std": 0.30433881282806396, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999925494194031, + "sampling/importance_sampling_ratio/min": 0.015217061154544353, + "sampling/sampling_logp_difference/max": 4.185338020324707, + "sampling/sampling_logp_difference/mean": 0.02016574889421463, + "step": 313 + }, + { + "clip_ratio/high_max": 8.814751254249131e-06, + "clip_ratio/high_mean": 2.203687813562283e-06, + "clip_ratio/low_mean": 3.137724206681014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3580929766685585e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 6260.2578125, + "completions/mean_terminated_length": 6260.2578125, + "completions/min_length": 790.0, + "completions/min_terminated_length": 790.0, + "entropy": 0.9523455575108528, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027907798066735268, + "learning_rate": 1e-05, + "loss": 0.0302, + "num_tokens": 256018935.0, + "reward": 0.421875, + "reward_std": 0.2659186124801636, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000364780426025, + "sampling/importance_sampling_ratio/min": 7.485197420464829e-05, + "sampling/sampling_logp_difference/max": 9.499998092651367, + "sampling/sampling_logp_difference/mean": 0.0191945917904377, + "step": 314 + }, + { + "clip_ratio/high_max": 2.8685263259831117e-05, + "clip_ratio/high_mean": 7.171315814957779e-06, + "clip_ratio/low_mean": 2.780131131885355e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.497262770224552e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16124.0, + "completions/mean_length": 6202.828125, + "completions/mean_terminated_length": 6041.22265625, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.8513326346874237, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023744129575788975, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 256841129.0, + "reward": 0.5625, + "reward_std": 0.32407689094543457, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000146627426147, + "sampling/importance_sampling_ratio/min": 9.269781003240496e-06, + "sampling/sampling_logp_difference/max": 11.588750839233398, + "sampling/sampling_logp_difference/mean": 0.019519174471497536, + "step": 315 + }, + { + "clip_ratio/high_max": 1.6381697605538648e-05, + "clip_ratio/high_mean": 4.095424401384662e-06, + "clip_ratio/low_mean": 3.0394592840821133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.449001792432682e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16328.0, + "completions/mean_length": 8019.4609375, + "completions/mean_terminated_length": 7073.90380859375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.9211000874638557, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024705040268599987, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 257884188.0, + "reward": 0.3046875, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999315738677979, + "sampling/importance_sampling_ratio/min": 0.016358470544219017, + "sampling/sampling_logp_difference/max": 4.113009452819824, + "sampling/sampling_logp_difference/mean": 0.01984308287501335, + "step": 316 + }, + { + "clip_ratio/high_max": 7.485402420570608e-06, + "clip_ratio/high_mean": 1.871350605142652e-06, + "clip_ratio/low_mean": 3.025547425750119e-05, + "clip_ratio/low_min": 2.697337095014518e-06, + "clip_ratio/region_mean": 3.212682509001752e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15210.0, + "completions/mean_length": 7257.6875, + "completions/mean_terminated_length": 7038.65625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.8801277950406075, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032848953269422054, + "learning_rate": 1e-05, + "loss": 0.0305, + "num_tokens": 258831852.0, + "reward": 0.4296875, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986124992371, + "sampling/importance_sampling_ratio/min": 0.00019848966621793807, + "sampling/sampling_logp_difference/max": 8.524773597717285, + "sampling/sampling_logp_difference/mean": 0.019743187353014946, + "step": 317 + }, + { + "clip_ratio/high_max": 1.52771035573096e-05, + "clip_ratio/high_mean": 3.8192758893274e-06, + "clip_ratio/low_mean": 3.605492440783564e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.987420052453672e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14903.0, + "completions/mean_length": 6042.84375, + 
"completions/mean_terminated_length": 5878.69873046875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.8792382404208183, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004201764706522226, + "learning_rate": 1e-05, + "loss": 0.099, + "num_tokens": 259623512.0, + "reward": 0.640625, + "reward_std": 0.3913668990135193, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998612403869629, + "sampling/importance_sampling_ratio/min": 0.00027811730979010463, + "sampling/sampling_logp_difference/max": 8.187467575073242, + "sampling/sampling_logp_difference/mean": 0.018901977688074112, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.1642084397608414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1642084397608414e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16381.0, + "completions/mean_length": 7667.6875, + "completions/mean_terminated_length": 7458.49658203125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9096411988139153, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014557713875547051, + "learning_rate": 1e-05, + "loss": 0.0383, + "num_tokens": 260623928.0, + "reward": 0.3515625, + "reward_std": 0.22726887464523315, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0002615761768538505, + "sampling/sampling_logp_difference/max": 8.248785018920898, + "sampling/sampling_logp_difference/mean": 0.01979639381170273, 
+ "step": 319 + }, + { + "clip_ratio/high_max": 2.36019068324822e-05, + "clip_ratio/high_mean": 5.90047670812055e-06, + "clip_ratio/low_mean": 2.704614530557592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2946622809504333e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15514.0, + "completions/max_terminated_length": 15514.0, + "completions/mean_length": 6428.8203125, + "completions/mean_terminated_length": 6428.8203125, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "entropy": 0.9974069148302078, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028210312593728304, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 261465625.0, + "reward": 0.46875, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000195503234863, + "sampling/importance_sampling_ratio/min": 0.001225265790708363, + "sampling/sampling_logp_difference/max": 6.704597473144531, + "sampling/sampling_logp_difference/mean": 0.021066997200250626, + "step": 320 + }, + { + "clip_ratio/high_max": 2.9634452857862925e-05, + "clip_ratio/high_mean": 7.408613214465731e-06, + "clip_ratio/low_mean": 3.7066520235384814e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.447513333616371e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15224.0, + "completions/mean_length": 5664.515625, + "completions/mean_terminated_length": 5580.1103515625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "entropy": 0.9557281509041786, + "epoch": 0.2953081876724931, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024263609666377306, + "learning_rate": 1e-05, + "loss": 0.0357, + "num_tokens": 262208475.0, + "reward": 
0.4765625, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998950958251953, + "sampling/importance_sampling_ratio/min": 0.0001059407222783193, + "sampling/sampling_logp_difference/max": 9.152630805969238, + "sampling/sampling_logp_difference/mean": 0.01997508481144905, + "step": 321 + }, + { + "clip_ratio/high_max": 1.9527269159880234e-05, + "clip_ratio/high_mean": 5.685056066795369e-06, + "clip_ratio/low_mean": 4.980480150607036e-05, + "clip_ratio/low_min": 5.136423624207964e-06, + "clip_ratio/region_mean": 5.5489856435997353e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15566.0, + "completions/mean_length": 6649.5390625, + "completions/mean_terminated_length": 6170.794921875, + "completions/min_length": 599.0, + "completions/min_terminated_length": 599.0, + "entropy": 0.9003193452954292, + "epoch": 0.29622815087396503, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025556792970746756, + "learning_rate": 1e-05, + "loss": 0.0366, + "num_tokens": 263078672.0, + "reward": 0.453125, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998431205749512, + "sampling/importance_sampling_ratio/min": 3.631301660789177e-05, + "sampling/sampling_logp_difference/max": 10.223334312438965, + "sampling/sampling_logp_difference/mean": 0.019613387063145638, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.1492368912513484e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.1492368912513484e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15190.0, + "completions/mean_length": 5819.4140625, + "completions/mean_terminated_length": 5478.62060546875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9234923645853996, + "epoch": 0.297148114075437, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0008845282136462629, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 263843797.0, + "reward": 0.5390625, + "reward_std": 0.14913026988506317, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452233314514, + "sampling/importance_sampling_ratio/min": 0.06759586930274963, + "sampling/sampling_logp_difference/max": 2.6942083835601807, + "sampling/sampling_logp_difference/mean": 0.02007308602333069, + "step": 323 + }, + { + "clip_ratio/high_max": 1.1687909363899962e-05, + "clip_ratio/high_mean": 2.9219773409749905e-06, + "clip_ratio/low_mean": 2.420720869622528e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7129186207730527e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16033.0, + "completions/mean_length": 6952.96875, + "completions/mean_terminated_length": 6726.62451171875, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 0.8909401148557663, + "epoch": 0.2980680772769089, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001527746208012104, + "learning_rate": 1e-05, + "loss": 0.0633, + "num_tokens": 264751769.0, + "reward": 0.453125, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999520778656006, + "sampling/importance_sampling_ratio/min": 0.000505264790263027, 
+ "sampling/sampling_logp_difference/max": 7.590427875518799, + "sampling/sampling_logp_difference/mean": 0.019622590392827988, + "step": 324 + }, + { + "clip_ratio/high_max": 1.5079081094881985e-05, + "clip_ratio/high_mean": 4.600909505825257e-06, + "clip_ratio/low_mean": 5.333864191925386e-05, + "clip_ratio/low_min": 5.043169494456379e-06, + "clip_ratio/region_mean": 5.793955187982647e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15667.0, + "completions/mean_length": 8138.5234375, + "completions/mean_terminated_length": 7733.0078125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 0.972789965569973, + "epoch": 0.29898804047838085, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003113618353381753, + "learning_rate": 1e-05, + "loss": 0.0771, + "num_tokens": 265810580.0, + "reward": 0.40625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998828172683716, + "sampling/importance_sampling_ratio/min": 9.312124404914357e-08, + "sampling/sampling_logp_difference/max": 16.189363479614258, + "sampling/sampling_logp_difference/mean": 0.02168515883386135, + "step": 325 + }, + { + "clip_ratio/high_max": 4.463807272259146e-06, + "clip_ratio/high_mean": 1.1159518180647865e-06, + "clip_ratio/low_mean": 3.45970811395091e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.571303295757389e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16248.0, + "completions/mean_length": 7565.6015625, + "completions/mean_terminated_length": 7131.90966796875, + "completions/min_length": 1017.0, + "completions/min_terminated_length": 1017.0, + "entropy": 0.835600845515728, + "epoch": 0.2999080036798528, + 
"frac_reward_zero_std": 0.5625, + "grad_norm": 0.0009589543915353715, + "learning_rate": 1e-05, + "loss": 0.0509, + "num_tokens": 266796097.0, + "reward": 0.5078125, + "reward_std": 0.16834920644760132, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0017039220547303557, + "sampling/sampling_logp_difference/max": 6.374822616577148, + "sampling/sampling_logp_difference/mean": 0.01885361596941948, + "step": 326 + }, + { + "clip_ratio/high_max": 2.260646033391822e-05, + "clip_ratio/high_mean": 5.651615083479555e-06, + "clip_ratio/low_mean": 5.806843591926736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.372005145749426e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 7124.0546875, + "completions/mean_terminated_length": 6668.64697265625, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "entropy": 0.9041655585169792, + "epoch": 0.30082796688132474, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024741124361753464, + "learning_rate": 1e-05, + "loss": 0.0514, + "num_tokens": 267727528.0, + "reward": 0.4296875, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999866247177124, + "sampling/importance_sampling_ratio/min": 4.63160322397016e-05, + "sampling/sampling_logp_difference/max": 9.980022430419922, + "sampling/sampling_logp_difference/mean": 0.01998118683695793, + "step": 327 + }, + { + "clip_ratio/high_max": 1.7461054540035548e-05, + "clip_ratio/high_mean": 5.456775966194982e-06, + "clip_ratio/low_mean": 
3.374219397755951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.919897017112817e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14834.0, + "completions/mean_length": 6757.65625, + "completions/mean_terminated_length": 6681.8583984375, + "completions/min_length": 1123.0, + "completions/min_terminated_length": 1123.0, + "entropy": 1.105302907526493, + "epoch": 0.3017479300827967, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002233455190435052, + "learning_rate": 1e-05, + "loss": 0.0147, + "num_tokens": 268610868.0, + "reward": 0.375, + "reward_std": 0.23857943713665009, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 3.3169128528243164e-06, + "sampling/sampling_logp_difference/max": 12.616476058959961, + "sampling/sampling_logp_difference/mean": 0.021600255742669106, + "step": 328 + }, + { + "clip_ratio/high_max": 1.7514204046165105e-05, + "clip_ratio/high_mean": 4.378551011541276e-06, + "clip_ratio/low_mean": 4.300070588669769e-05, + "clip_ratio/low_min": 3.6705330330732977e-06, + "clip_ratio/region_mean": 4.7379256784552126e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16147.0, + "completions/mean_length": 7522.5546875, + "completions/mean_terminated_length": 7381.8974609375, + "completions/min_length": 1390.0, + "completions/min_terminated_length": 1390.0, + "entropy": 1.0577925741672516, + "epoch": 0.30266789328426863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017964976141229272, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 269594867.0, + "reward": 0.421875, + "reward_std": 0.28223684430122375, + "rewards/accuracy_reward/mean": 0.421875, + 
"rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999937891960144, + "sampling/importance_sampling_ratio/min": 0.002247168216854334, + "sampling/sampling_logp_difference/max": 6.098084449768066, + "sampling/sampling_logp_difference/mean": 0.021326296031475067, + "step": 329 + }, + { + "clip_ratio/high_max": 1.7011016097967513e-05, + "clip_ratio/high_mean": 4.252754024491878e-06, + "clip_ratio/low_mean": 2.5991578013417893e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0244332265283447e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 6232.109375, + "completions/mean_terminated_length": 5904.62890625, + "completions/min_length": 1238.0, + "completions/min_terminated_length": 1238.0, + "entropy": 0.8473618850111961, + "epoch": 0.30358785648574055, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023369218688458204, + "learning_rate": 1e-05, + "loss": 0.0291, + "num_tokens": 270410785.0, + "reward": 0.6015625, + "reward_std": 0.23516449332237244, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000063180923462, + "sampling/importance_sampling_ratio/min": 0.00010575528722256422, + "sampling/sampling_logp_difference/max": 9.154382705688477, + "sampling/sampling_logp_difference/mean": 0.018453873693943024, + "step": 330 + }, + { + "clip_ratio/high_max": 1.2072427125531249e-05, + "clip_ratio/high_mean": 4.300789669287042e-06, + "clip_ratio/low_mean": 3.064826853460545e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4949058090205654e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14454.0, + "completions/max_terminated_length": 14454.0, + "completions/mean_length": 
5847.0625, + "completions/mean_terminated_length": 5847.0625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.8186105340719223, + "epoch": 0.3045078196872125, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014558705734089017, + "learning_rate": 1e-05, + "loss": 0.0672, + "num_tokens": 271179113.0, + "reward": 0.5390625, + "reward_std": 0.22673210501670837, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000067114830017, + "sampling/importance_sampling_ratio/min": 1.994453305087518e-05, + "sampling/sampling_logp_difference/max": 10.822555541992188, + "sampling/sampling_logp_difference/mean": 0.017629161477088928, + "step": 331 + }, + { + "clip_ratio/high_max": 3.204624090358266e-05, + "clip_ratio/high_mean": 8.719567063053546e-06, + "clip_ratio/low_mean": 5.131868192620459e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.0038249102944974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16065.0, + "completions/mean_length": 6670.6015625, + "completions/mean_terminated_length": 6516.4208984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9379853457212448, + "epoch": 0.30542778288868444, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002496426459401846, + "learning_rate": 1e-05, + "loss": 0.051, + "num_tokens": 272054510.0, + "reward": 0.328125, + "reward_std": 0.29932624101638794, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998705387115479, + "sampling/importance_sampling_ratio/min": 0.00010894420120166615, + "sampling/sampling_logp_difference/max": 9.124674797058105, + 
"sampling/sampling_logp_difference/mean": 0.020175442099571228, + "step": 332 + }, + { + "clip_ratio/high_max": 1.1311959497106727e-05, + "clip_ratio/high_mean": 2.827989874276682e-06, + "clip_ratio/low_mean": 6.672416202491149e-05, + "clip_ratio/low_min": 4.344501576269977e-06, + "clip_ratio/region_mean": 6.955215212656185e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + "completions/max_terminated_length": 15716.0, + "completions/mean_length": 6613.328125, + "completions/mean_terminated_length": 6613.328125, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "entropy": 1.0781218782067299, + "epoch": 0.3063477460901564, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028466631192713976, + "learning_rate": 1e-05, + "loss": 0.0257, + "num_tokens": 272920304.0, + "reward": 0.3359375, + "reward_std": 0.32089439034461975, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999476671218872, + "sampling/importance_sampling_ratio/min": 0.02985518053174019, + "sampling/sampling_logp_difference/max": 3.511396884918213, + "sampling/sampling_logp_difference/mean": 0.02250460349023342, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.3429964585375274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3429964585375274e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6632.78125, + "completions/mean_terminated_length": 6318.2255859375, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "entropy": 0.9595735669136047, + "epoch": 0.30726770929162833, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0027409526519477367, + "learning_rate": 1e-05, + "loss": 0.0564, 
+ "num_tokens": 273789588.0, + "reward": 0.3671875, + "reward_std": 0.12863078713417053, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999409914016724, + "sampling/importance_sampling_ratio/min": 8.484355930704623e-05, + "sampling/sampling_logp_difference/max": 9.374701499938965, + "sampling/sampling_logp_difference/mean": 0.02000725269317627, + "step": 334 + }, + { + "clip_ratio/high_max": 1.0485138318472309e-05, + "clip_ratio/high_mean": 2.6212845796180773e-06, + "clip_ratio/low_mean": 6.270217818382662e-05, + "clip_ratio/low_min": 1.282997527596308e-05, + "clip_ratio/region_mean": 6.532346287713153e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15567.0, + "completions/mean_length": 8083.421875, + "completions/mean_terminated_length": 7884.20849609375, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.139024168252945, + "epoch": 0.30818767249310025, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001853835303336382, + "learning_rate": 1e-05, + "loss": 0.0521, + "num_tokens": 274843754.0, + "reward": 0.2734375, + "reward_std": 0.29719969630241394, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961256980896, + "sampling/importance_sampling_ratio/min": 6.099340225773631e-06, + "sampling/sampling_logp_difference/max": 12.007329940795898, + "sampling/sampling_logp_difference/mean": 0.023757295683026314, + "step": 335 + }, + { + "clip_ratio/high_max": 6.558237146236934e-06, + "clip_ratio/high_mean": 1.6395592865592334e-06, + "clip_ratio/low_mean": 3.2649955073793535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.428951481510012e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 16058.0, + "completions/max_terminated_length": 16058.0, + "completions/mean_length": 6932.6640625, + "completions/mean_terminated_length": 6932.6640625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.2969390451908112, + "epoch": 0.3091076356945722, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002049664966762066, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 275750023.0, + "reward": 0.21875, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000157356262207, + "sampling/importance_sampling_ratio/min": 5.287989188218489e-05, + "sampling/sampling_logp_difference/max": 9.847487449645996, + "sampling/sampling_logp_difference/mean": 0.021840902045369148, + "step": 336 + }, + { + "clip_ratio/high_max": 5.1826359594997484e-06, + "clip_ratio/high_mean": 1.2956589898749371e-06, + "clip_ratio/low_mean": 3.607215444390022e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.736781377483567e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15213.0, + "completions/mean_length": 7630.65625, + "completions/mean_terminated_length": 7124.26416015625, + "completions/min_length": 1002.0, + "completions/min_terminated_length": 1002.0, + "entropy": 0.959126852452755, + "epoch": 0.31002759889604414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030745298136025667, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 276750011.0, + "reward": 0.3125, + "reward_std": 0.30091896653175354, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999728798866272, 
+ "sampling/importance_sampling_ratio/min": 3.149233089061454e-05, + "sampling/sampling_logp_difference/max": 10.365766525268555, + "sampling/sampling_logp_difference/mean": 0.021394159644842148, + "step": 337 + }, + { + "clip_ratio/high_max": 6.921764679646003e-06, + "clip_ratio/high_mean": 2.5604765028219845e-06, + "clip_ratio/low_mean": 2.64957521380893e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.905622847038103e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15907.0, + "completions/mean_length": 7383.2421875, + "completions/mean_terminated_length": 7240.37353515625, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 1.1512386053800583, + "epoch": 0.3109475620975161, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014476332580670714, + "learning_rate": 1e-05, + "loss": 0.0686, + "num_tokens": 277715450.0, + "reward": 0.4140625, + "reward_std": 0.2477683424949646, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750256538391, + "sampling/importance_sampling_ratio/min": 4.5251621486386284e-05, + "sampling/sampling_logp_difference/max": 10.00327205657959, + "sampling/sampling_logp_difference/mean": 0.020672230049967766, + "step": 338 + }, + { + "clip_ratio/high_max": 3.7021679872850655e-06, + "clip_ratio/high_mean": 9.255419968212664e-07, + "clip_ratio/low_mean": 3.8645826748506806e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.957136880217149e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14693.0, + "completions/mean_length": 5876.421875, + "completions/mean_terminated_length": 5793.68505859375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 
1.0786077454686165, + "epoch": 0.31186752529898804, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018895689863711596, + "learning_rate": 1e-05, + "loss": 0.0067, + "num_tokens": 278491688.0, + "reward": 0.3984375, + "reward_std": 0.21146979928016663, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998824596405029, + "sampling/importance_sampling_ratio/min": 0.0007111100130714476, + "sampling/sampling_logp_difference/max": 7.248683452606201, + "sampling/sampling_logp_difference/mean": 0.020282316952943802, + "step": 339 + }, + { + "clip_ratio/high_max": 1.8740533050731756e-05, + "clip_ratio/high_mean": 4.685133262682939e-06, + "clip_ratio/low_mean": 2.9699310402975243e-05, + "clip_ratio/low_min": 4.435140454006614e-06, + "clip_ratio/region_mean": 3.4384443438284507e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14627.0, + "completions/mean_length": 7162.625, + "completions/mean_terminated_length": 6709.1142578125, + "completions/min_length": 986.0, + "completions/min_terminated_length": 986.0, + "entropy": 0.898807168006897, + "epoch": 0.31278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002133915899321437, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 279427384.0, + "reward": 0.4453125, + "reward_std": 0.32142335176467896, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.004845126066356897, + "sampling/sampling_logp_difference/max": 5.329782009124756, + "sampling/sampling_logp_difference/mean": 0.019643021747469902, + "step": 340 + }, + { + "clip_ratio/high_max": 1.472241683586617e-05, + 
"clip_ratio/high_mean": 5.561973125622899e-06, + "clip_ratio/low_mean": 6.452910844245707e-05, + "clip_ratio/low_min": 9.302988473791629e-06, + "clip_ratio/region_mean": 7.009108327338254e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15965.0, + "completions/mean_length": 7072.3828125, + "completions/mean_terminated_length": 6999.06298828125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.8942967653274536, + "epoch": 0.3137074517019319, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023624920286238194, + "learning_rate": 1e-05, + "loss": 0.0866, + "num_tokens": 280352177.0, + "reward": 0.375, + "reward_std": 0.36637401580810547, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999604225158691, + "sampling/importance_sampling_ratio/min": 0.0008250995306298137, + "sampling/sampling_logp_difference/max": 7.100006580352783, + "sampling/sampling_logp_difference/mean": 0.020037520676851273, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.717265596809739e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.717265596809739e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16300.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 6553.203125, + "completions/mean_terminated_length": 6553.203125, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "entropy": 0.8765531405806541, + "epoch": 0.31462741490340385, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025228122249245644, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 281208411.0, + "reward": 0.40625, + "reward_std": 0.3390446603298187, + "rewards/accuracy_reward/mean": 0.40625, + 
"rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999656677246094, + "sampling/importance_sampling_ratio/min": 0.00030091358348727226, + "sampling/sampling_logp_difference/max": 8.108687400817871, + "sampling/sampling_logp_difference/mean": 0.018958289176225662, + "step": 342 + }, + { + "clip_ratio/high_max": 1.5562100998067763e-05, + "clip_ratio/high_mean": 3.890525249516941e-06, + "clip_ratio/low_mean": 6.593948137378902e-05, + "clip_ratio/low_min": 1.4238520634535234e-05, + "clip_ratio/region_mean": 6.983000685067964e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14916.0, + "completions/mean_length": 6489.40625, + "completions/mean_terminated_length": 6087.1865234375, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.8384068235754967, + "epoch": 0.3155473781048758, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003243578365072608, + "learning_rate": 1e-05, + "loss": 0.119, + "num_tokens": 282059863.0, + "reward": 0.515625, + "reward_std": 0.39689862728118896, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999314546585083, + "sampling/importance_sampling_ratio/min": 0.00026549631729722023, + "sampling/sampling_logp_difference/max": 8.233909606933594, + "sampling/sampling_logp_difference/mean": 0.01820875145494938, + "step": 343 + }, + { + "clip_ratio/high_max": 4.114007424504962e-06, + "clip_ratio/high_mean": 1.0285018561262405e-06, + "clip_ratio/low_mean": 3.0735714062757324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.176421569150989e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15859.0, + "completions/max_terminated_length": 15859.0, + 
"completions/mean_length": 7148.7890625, + "completions/mean_terminated_length": 7148.7890625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0214989855885506, + "epoch": 0.31646734130634774, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027867467142641544, + "learning_rate": 1e-05, + "loss": 0.0445, + "num_tokens": 282994036.0, + "reward": 0.4921875, + "reward_std": 0.28511500358581543, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999145269393921, + "sampling/importance_sampling_ratio/min": 0.027774186804890633, + "sampling/sampling_logp_difference/max": 3.583648204803467, + "sampling/sampling_logp_difference/mean": 0.0217401385307312, + "step": 344 + }, + { + "clip_ratio/high_max": 1.6063933799159713e-05, + "clip_ratio/high_mean": 5.513276278179546e-06, + "clip_ratio/low_mean": 4.230772367463942e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.782100086231367e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16091.0, + "completions/max_terminated_length": 16091.0, + "completions/mean_length": 5532.1328125, + "completions/mean_terminated_length": 5532.1328125, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "entropy": 0.9303388148546219, + "epoch": 0.3173873045078197, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0024432060308754444, + "learning_rate": 1e-05, + "loss": 0.0251, + "num_tokens": 283723605.0, + "reward": 0.421875, + "reward_std": 0.38717782497406006, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + "sampling/importance_sampling_ratio/min": 0.011936242692172527, + "sampling/sampling_logp_difference/max": 
4.428175926208496, + "sampling/sampling_logp_difference/mean": 0.019281461834907532, + "step": 345 + }, + { + "clip_ratio/high_max": 6.218693215487292e-06, + "clip_ratio/high_mean": 1.554673303871823e-06, + "clip_ratio/low_mean": 1.5384349637770356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6939022600581666e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6830.09375, + "completions/mean_terminated_length": 6441.72314453125, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "entropy": 0.9551377296447754, + "epoch": 0.31830726770929163, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031446516513824463, + "learning_rate": 1e-05, + "loss": -0.0037, + "num_tokens": 284617089.0, + "reward": 0.3671875, + "reward_std": 0.20911568403244019, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999873042106628, + "sampling/importance_sampling_ratio/min": 0.0007485119276680052, + "sampling/sampling_logp_difference/max": 7.197423458099365, + "sampling/sampling_logp_difference/mean": 0.01985902711749077, + "step": 346 + }, + { + "clip_ratio/high_max": 7.772906428726856e-06, + "clip_ratio/high_mean": 2.8712697712762747e-06, + "clip_ratio/low_mean": 3.287052913947264e-05, + "clip_ratio/low_min": 2.789369091260596e-06, + "clip_ratio/region_mean": 3.574179936549626e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15613.0, + "completions/mean_length": 6557.3515625, + "completions/mean_terminated_length": 6401.37353515625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 1.0254710763692856, + "epoch": 0.31922723091076355, + "frac_reward_zero_std": 0.4375, + 
"grad_norm": 0.0024617225863039494, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 285475910.0, + "reward": 0.390625, + "reward_std": 0.2761683464050293, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999537467956543, + "sampling/importance_sampling_ratio/min": 0.006225659977644682, + "sampling/sampling_logp_difference/max": 5.079075813293457, + "sampling/sampling_logp_difference/mean": 0.021138068288564682, + "step": 347 + }, + { + "clip_ratio/high_max": 1.0258745533064939e-05, + "clip_ratio/high_mean": 3.588538106669148e-06, + "clip_ratio/low_mean": 6.333507008093875e-05, + "clip_ratio/low_min": 4.415712737682043e-06, + "clip_ratio/region_mean": 6.692360875604209e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15851.0, + "completions/mean_length": 7379.140625, + "completions/mean_terminated_length": 7088.6611328125, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9518962875008583, + "epoch": 0.3201471941122355, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017496495274826884, + "learning_rate": 1e-05, + "loss": 0.0734, + "num_tokens": 286439696.0, + "reward": 0.390625, + "reward_std": 0.26538965106010437, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999358654022217, + "sampling/importance_sampling_ratio/min": 0.006735759321600199, + "sampling/sampling_logp_difference/max": 5.000324726104736, + "sampling/sampling_logp_difference/mean": 0.021384600549936295, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.854056094747648e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.854056094747648e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16123.0, + "completions/mean_length": 5960.140625, + "completions/mean_terminated_length": 5878.06298828125, + "completions/min_length": 833.0, + "completions/min_terminated_length": 833.0, + "entropy": 0.9556702002882957, + "epoch": 0.32106715731370744, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0013999518705531955, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 287226394.0, + "reward": 0.3515625, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 8.140038517012727e-06, + "sampling/sampling_logp_difference/max": 11.71871566772461, + "sampling/sampling_logp_difference/mean": 0.01937047764658928, + "step": 349 + }, + { + "clip_ratio/high_max": 8.395007171202451e-06, + "clip_ratio/high_mean": 2.0987517928006127e-06, + "clip_ratio/low_mean": 3.610323426528339e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.820198628545768e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12561.0, + "completions/mean_length": 5387.546875, + "completions/mean_terminated_length": 5300.96044921875, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.95712860673666, + "epoch": 0.3219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004228786565363407, + "learning_rate": 1e-05, + "loss": 0.0692, + "num_tokens": 287935952.0, + "reward": 0.5234375, + "reward_std": 0.29378965497016907, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, 
+ "sampling/importance_sampling_ratio/mean": 1.0000360012054443, + "sampling/importance_sampling_ratio/min": 0.005966294556856155, + "sampling/sampling_logp_difference/max": 5.121629238128662, + "sampling/sampling_logp_difference/mean": 0.020441649481654167, + "step": 350 + }, + { + "clip_ratio/high_max": 1.2559637070808094e-05, + "clip_ratio/high_mean": 3.1399092677020235e-06, + "clip_ratio/low_mean": 2.673440690159623e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9874316624045605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15957.0, + "completions/mean_length": 5799.625, + "completions/mean_terminated_length": 5716.283203125, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9457403644919395, + "epoch": 0.32290708371665133, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0029834613669663668, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 288696000.0, + "reward": 0.4921875, + "reward_std": 0.3884710967540741, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999950528144836, + "sampling/importance_sampling_ratio/min": 0.0011352180736139417, + "sampling/sampling_logp_difference/max": 6.780930519104004, + "sampling/sampling_logp_difference/mean": 0.021189026534557343, + "step": 351 + }, + { + "clip_ratio/high_max": 6.2518756749341264e-06, + "clip_ratio/high_mean": 1.5629689187335316e-06, + "clip_ratio/low_mean": 3.849920358334202e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0062172047328204e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16148.0, + "completions/mean_length": 7058.6875, + "completions/mean_terminated_length": 6757.87060546875, + "completions/min_length": 799.0, + 
"completions/min_terminated_length": 799.0, + "entropy": 0.8782663866877556, + "epoch": 0.32382704691812325, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002617151942104101, + "learning_rate": 1e-05, + "loss": 0.0874, + "num_tokens": 289618904.0, + "reward": 0.3515625, + "reward_std": 0.28353992104530334, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999942779541016, + "sampling/importance_sampling_ratio/min": 0.001438659499399364, + "sampling/sampling_logp_difference/max": 6.54404354095459, + "sampling/sampling_logp_difference/mean": 0.019699860364198685, + "step": 352 + }, + { + "clip_ratio/high_max": 1.8079134861181956e-05, + "clip_ratio/high_mean": 4.519783715295489e-06, + "clip_ratio/low_mean": 6.639697721766424e-05, + "clip_ratio/low_min": 1.0295151696482208e-05, + "clip_ratio/region_mean": 7.091676206982811e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15519.0, + "completions/mean_length": 6609.953125, + "completions/mean_terminated_length": 6454.81005859375, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.8895087689161301, + "epoch": 0.3247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0021503251045942307, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 290484378.0, + "reward": 0.3671875, + "reward_std": 0.35324612259864807, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 5.448641240946017e-05, + "sampling/sampling_logp_difference/max": 9.817559242248535, + "sampling/sampling_logp_difference/mean": 0.0200796015560627, + "step": 353 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 6.141278026916552e-05, + "clip_ratio/low_min": 1.333249815616e-05, + "clip_ratio/region_mean": 6.141278026916552e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7872.4921875, + "completions/mean_terminated_length": 7453.89306640625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 0.9183534607291222, + "epoch": 0.32566697332106714, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0023925534915179014, + "learning_rate": 1e-05, + "loss": 0.0895, + "num_tokens": 291512393.0, + "reward": 0.34375, + "reward_std": 0.3763991594314575, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0004287353658583015, + "sampling/sampling_logp_difference/max": 7.7546706199646, + "sampling/sampling_logp_difference/mean": 0.020358648151159286, + "step": 354 + }, + { + "clip_ratio/high_max": 1.0912609013757901e-05, + "clip_ratio/high_mean": 3.7178592720010784e-06, + "clip_ratio/low_mean": 1.995230707052542e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.367016588777915e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15485.0, + "completions/mean_length": 6605.6640625, + "completions/mean_terminated_length": 6290.23388671875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.9602678120136261, + "epoch": 0.3265869365225391, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018709113355726004, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 292380390.0, + "reward": 0.515625, + "reward_std": 
0.26303553581237793, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999732375144958, + "sampling/importance_sampling_ratio/min": 6.221406168016586e-10, + "sampling/sampling_logp_difference/max": 21.19785499572754, + "sampling/sampling_logp_difference/mean": 0.02150166593492031, + "step": 355 + }, + { + "clip_ratio/high_max": 2.202200403189636e-05, + "clip_ratio/high_mean": 6.279054105107207e-06, + "clip_ratio/low_mean": 5.168271604816255e-05, + "clip_ratio/low_min": 7.731559890089557e-06, + "clip_ratio/region_mean": 5.796177038064343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13477.0, + "completions/max_terminated_length": 13477.0, + "completions/mean_length": 6677.8828125, + "completions/mean_terminated_length": 6677.8828125, + "completions/min_length": 754.0, + "completions/min_terminated_length": 754.0, + "entropy": 1.001693107187748, + "epoch": 0.32750689972401104, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017649955116212368, + "learning_rate": 1e-05, + "loss": 0.0502, + "num_tokens": 293255287.0, + "reward": 0.3203125, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998878240585327, + "sampling/importance_sampling_ratio/min": 0.0027159738820046186, + "sampling/sampling_logp_difference/max": 5.908604621887207, + "sampling/sampling_logp_difference/mean": 0.020375655964016914, + "step": 356 + }, + { + "clip_ratio/high_max": 5.7686097534315195e-06, + "clip_ratio/high_mean": 2.223324372607749e-06, + "clip_ratio/low_mean": 2.7612236522145395e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9835560894753144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15925.0, + "completions/mean_length": 6210.6953125, + "completions/mean_terminated_length": 6049.21484375, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.9842480793595314, + "epoch": 0.32842686292548295, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024816791992634535, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 294069184.0, + "reward": 0.4140625, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000953674316406, + "sampling/importance_sampling_ratio/min": 0.0047831060364842415, + "sampling/sampling_logp_difference/max": 5.342665195465088, + "sampling/sampling_logp_difference/mean": 0.021009165793657303, + "step": 357 + }, + { + "clip_ratio/high_max": 5.0844009820139036e-06, + "clip_ratio/high_mean": 1.2711002455034759e-06, + "clip_ratio/low_mean": 4.299241186345171e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.426351074471313e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16085.0, + "completions/mean_length": 6876.0546875, + "completions/mean_terminated_length": 6725.13525390625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.8680268228054047, + "epoch": 0.32934682612695493, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0030787813011556864, + "learning_rate": 1e-05, + "loss": 0.1096, + "num_tokens": 294969111.0, + "reward": 0.4921875, + "reward_std": 0.3514111638069153, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999387264251709, + "sampling/importance_sampling_ratio/min": 
0.0036591701209545135, + "sampling/sampling_logp_difference/max": 5.610518932342529, + "sampling/sampling_logp_difference/mean": 0.019419874995946884, + "step": 358 + }, + { + "clip_ratio/high_max": 5.279830929794116e-06, + "clip_ratio/high_mean": 1.319957732448529e-06, + "clip_ratio/low_mean": 3.3445195754211454e-05, + "clip_ratio/low_min": 3.1955414669937454e-06, + "clip_ratio/region_mean": 3.476515314559947e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16154.0, + "completions/mean_length": 7079.7734375, + "completions/mean_terminated_length": 6932.087890625, + "completions/min_length": 973.0, + "completions/min_terminated_length": 973.0, + "entropy": 1.0033101588487625, + "epoch": 0.33026678932842685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027940638829022646, + "learning_rate": 1e-05, + "loss": 0.1352, + "num_tokens": 295894682.0, + "reward": 0.4140625, + "reward_std": 0.40319663286209106, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999890923500061, + "sampling/importance_sampling_ratio/min": 0.00033553718822076917, + "sampling/sampling_logp_difference/max": 7.999777793884277, + "sampling/sampling_logp_difference/mean": 0.021608728915452957, + "step": 359 + }, + { + "clip_ratio/high_max": 4.0542295209888835e-06, + "clip_ratio/high_mean": 1.0135573802472209e-06, + "clip_ratio/low_mean": 3.935158406420669e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0365141785514425e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14374.0, + "completions/mean_length": 6487.421875, + "completions/mean_terminated_length": 6249.904296875, + "completions/min_length": 637.0, + "completions/min_terminated_length": 637.0, + "entropy": 0.9404204189777374, + 
"epoch": 0.3311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021709369029849768, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 296744216.0, + "reward": 0.4296875, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000174045562744, + "sampling/importance_sampling_ratio/min": 0.00012341697583906353, + "sampling/sampling_logp_difference/max": 8.9999418258667, + "sampling/sampling_logp_difference/mean": 0.02024281956255436, + "step": 360 + }, + { + "clip_ratio/high_max": 2.4414162908215076e-05, + "clip_ratio/high_mean": 6.103540727053769e-06, + "clip_ratio/low_mean": 2.0490186102506414e-05, + "clip_ratio/low_min": 2.8498473056970397e-06, + "clip_ratio/region_mean": 2.6593726602186507e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14584.0, + "completions/mean_length": 6668.1953125, + "completions/mean_terminated_length": 6273.24365234375, + "completions/min_length": 567.0, + "completions/min_terminated_length": 567.0, + "entropy": 0.8671490699052811, + "epoch": 0.33210671573137074, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018110686214640737, + "learning_rate": 1e-05, + "loss": -0.0018, + "num_tokens": 297617937.0, + "reward": 0.4765625, + "reward_std": 0.22673210501670837, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999983549118042, + "sampling/importance_sampling_ratio/min": 0.0003801324055530131, + "sampling/sampling_logp_difference/max": 7.874990940093994, + "sampling/sampling_logp_difference/mean": 0.01934785582125187, + "step": 361 + }, + { + "clip_ratio/high_max": 8.66071218297293e-06, + "clip_ratio/high_mean": 
2.1651780457432324e-06, + "clip_ratio/low_mean": 2.4539695857583865e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6704873903327098e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15979.0, + "completions/mean_length": 8579.9921875, + "completions/mean_terminated_length": 7989.7734375, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 1.0337364450097084, + "epoch": 0.3330266789328427, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014365602983161807, + "learning_rate": 1e-05, + "loss": 0.045, + "num_tokens": 298736304.0, + "reward": 0.1953125, + "reward_std": 0.1999218761920929, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999146461486816, + "sampling/importance_sampling_ratio/min": 0.0014037116197869182, + "sampling/sampling_logp_difference/max": 6.5686354637146, + "sampling/sampling_logp_difference/mean": 0.021067796275019646, + "step": 362 + }, + { + "clip_ratio/high_max": 7.748803682261496e-06, + "clip_ratio/high_mean": 1.937200920565374e-06, + "clip_ratio/low_mean": 5.063434127805522e-05, + "clip_ratio/low_min": 9.66116931522265e-06, + "clip_ratio/region_mean": 5.257154271021136e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16368.0, + "completions/mean_length": 7000.8203125, + "completions/mean_terminated_length": 6926.93701171875, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.8918163478374481, + "epoch": 0.33394664213431463, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003008107887580991, + "learning_rate": 1e-05, + "loss": 0.0862, + "num_tokens": 299653249.0, + "reward": 0.453125, + "reward_std": 0.3322049677371979, + 
"rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999739527702332, + "sampling/importance_sampling_ratio/min": 0.002478980226442218, + "sampling/sampling_logp_difference/max": 5.999907970428467, + "sampling/sampling_logp_difference/mean": 0.020022090524435043, + "step": 363 + }, + { + "clip_ratio/high_max": 1.5043352505017538e-05, + "clip_ratio/high_mean": 3.7608381262543844e-06, + "clip_ratio/low_mean": 8.800596447144926e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.2561434687086148e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16375.0, + "completions/max_terminated_length": 16375.0, + "completions/mean_length": 7319.578125, + "completions/mean_terminated_length": 7319.578125, + "completions/min_length": 1974.0, + "completions/min_terminated_length": 1974.0, + "entropy": 0.9145128801465034, + "epoch": 0.33486660533578655, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0010370119707658887, + "learning_rate": 1e-05, + "loss": 0.0138, + "num_tokens": 300608099.0, + "reward": 0.4609375, + "reward_std": 0.1412346363067627, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999158382415771, + "sampling/importance_sampling_ratio/min": 0.00012156071898061782, + "sampling/sampling_logp_difference/max": 9.015096664428711, + "sampling/sampling_logp_difference/mean": 0.019386455416679382, + "step": 364 + }, + { + "clip_ratio/high_max": 9.589830597178661e-06, + "clip_ratio/high_mean": 2.3974576492946653e-06, + "clip_ratio/low_mean": 2.2494899667435675e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4892357714634272e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
16316.0, + "completions/mean_length": 6956.90625, + "completions/mean_terminated_length": 6882.67724609375, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.9679212644696236, + "epoch": 0.3357865685372585, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021569218952208757, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 301516535.0, + "reward": 0.4765625, + "reward_std": 0.23462772369384766, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.01621459797024727, + "sampling/sampling_logp_difference/max": 4.121843338012695, + "sampling/sampling_logp_difference/mean": 0.020638462156057358, + "step": 365 + }, + { + "clip_ratio/high_max": 1.1957331025769236e-05, + "clip_ratio/high_mean": 2.989332756442309e-06, + "clip_ratio/low_mean": 2.334770033485256e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6337033204981708e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16343.0, + "completions/mean_length": 6933.1953125, + "completions/mean_terminated_length": 6706.37646484375, + "completions/min_length": 979.0, + "completions/min_terminated_length": 979.0, + "entropy": 0.9610472694039345, + "epoch": 0.33670653173873044, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0019900640472769737, + "learning_rate": 1e-05, + "loss": 0.0329, + "num_tokens": 302422120.0, + "reward": 0.4921875, + "reward_std": 0.22908620536327362, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 7.346414143682978e-09, + 
"sampling/sampling_logp_difference/max": 18.729053497314453, + "sampling/sampling_logp_difference/mean": 0.020782412961125374, + "step": 366 + }, + { + "clip_ratio/high_max": 1.6365190958822495e-05, + "clip_ratio/high_mean": 4.091297739705624e-06, + "clip_ratio/low_mean": 2.5385876426753384e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9477173825398495e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15799.0, + "completions/max_terminated_length": 15799.0, + "completions/mean_length": 6711.640625, + "completions/mean_terminated_length": 6711.640625, + "completions/min_length": 814.0, + "completions/min_terminated_length": 814.0, + "entropy": 0.8035724982619286, + "epoch": 0.3376264949402024, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001954294042661786, + "learning_rate": 1e-05, + "loss": 0.0264, + "num_tokens": 303299402.0, + "reward": 0.4765625, + "reward_std": 0.2856517732143402, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000114440917969, + "sampling/importance_sampling_ratio/min": 0.002623806707561016, + "sampling/sampling_logp_difference/max": 5.943129062652588, + "sampling/sampling_logp_difference/mean": 0.018188728019595146, + "step": 367 + }, + { + "clip_ratio/high_max": 8.633360948806512e-06, + "clip_ratio/high_mean": 2.158340237201628e-06, + "clip_ratio/low_mean": 3.7187305906627444e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9345645916455396e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15980.0, + "completions/mean_length": 6977.890625, + "completions/mean_terminated_length": 6674.4677734375, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.9545647650957108, + "epoch": 0.33854645814167433, + "frac_reward_zero_std": 
0.5625, + "grad_norm": 0.0022571857552975416, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 304210412.0, + "reward": 0.4375, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999645948410034, + "sampling/importance_sampling_ratio/min": 5.501153282239102e-06, + "sampling/sampling_logp_difference/max": 12.110552787780762, + "sampling/sampling_logp_difference/mean": 0.021196123212575912, + "step": 368 + }, + { + "clip_ratio/high_max": 1.2197504474897869e-05, + "clip_ratio/high_mean": 3.0493761187244672e-06, + "clip_ratio/low_mean": 2.7975384682576987e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1024760801301454e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16087.0, + "completions/mean_length": 5952.8359375, + "completions/mean_terminated_length": 5349.3798828125, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "entropy": 0.846152663230896, + "epoch": 0.33946642134314625, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003541936632245779, + "learning_rate": 1e-05, + "loss": 0.0897, + "num_tokens": 304989015.0, + "reward": 0.4453125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998842477798462, + "sampling/importance_sampling_ratio/min": 0.0019083521328866482, + "sampling/sampling_logp_difference/max": 6.261515140533447, + "sampling/sampling_logp_difference/mean": 0.018978029489517212, + "step": 369 + }, + { + "clip_ratio/high_max": 1.1725882586688385e-05, + "clip_ratio/high_mean": 2.9314706466720963e-06, + "clip_ratio/low_mean": 6.290217379500973e-05, + 
"clip_ratio/low_min": 1.226112590302364e-05, + "clip_ratio/region_mean": 6.583364438483841e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16098.0, + "completions/mean_length": 7976.9296875, + "completions/mean_terminated_length": 7635.1787109375, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "entropy": 0.9827005565166473, + "epoch": 0.3403863845446182, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023713603150099516, + "learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 306032054.0, + "reward": 0.3046875, + "reward_std": 0.2527809143066406, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000019073486328, + "sampling/importance_sampling_ratio/min": 3.2563195873080986e-07, + "sampling/sampling_logp_difference/max": 14.937498092651367, + "sampling/sampling_logp_difference/mean": 0.0217706598341465, + "step": 370 + }, + { + "clip_ratio/high_max": 2.3902987095425487e-05, + "clip_ratio/high_mean": 7.721868257704045e-06, + "clip_ratio/low_mean": 4.01184702241153e-05, + "clip_ratio/low_min": 1.341508686891757e-05, + "clip_ratio/region_mean": 4.784033922078379e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16369.0, + "completions/mean_length": 7117.8828125, + "completions/mean_terminated_length": 6895.49609375, + "completions/min_length": 1314.0, + "completions/min_terminated_length": 1314.0, + "entropy": 0.8897347301244736, + "epoch": 0.34130634774609014, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023132229689508677, + "learning_rate": 1e-05, + "loss": 0.162, + "num_tokens": 306960599.0, + "reward": 0.515625, + "reward_std": 0.34822866320610046, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 
0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999181032180786, + "sampling/importance_sampling_ratio/min": 0.0007341355667449534, + "sampling/sampling_logp_difference/max": 7.2168169021606445, + "sampling/sampling_logp_difference/mean": 0.018669119104743004, + "step": 371 + }, + { + "clip_ratio/high_max": 4.371240720502101e-06, + "clip_ratio/high_mean": 1.0928101801255252e-06, + "clip_ratio/low_mean": 4.9660218792269006e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.075302897239453e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15995.0, + "completions/mean_length": 6971.0390625, + "completions/mean_terminated_length": 6745.12841796875, + "completions/min_length": 871.0, + "completions/min_terminated_length": 871.0, + "entropy": 1.0919678956270218, + "epoch": 0.3422263109475621, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0030236958991736174, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 307873100.0, + "reward": 0.3359375, + "reward_std": 0.34245961904525757, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000091791152954, + "sampling/importance_sampling_ratio/min": 0.01082979142665863, + "sampling/sampling_logp_difference/max": 4.525454521179199, + "sampling/sampling_logp_difference/mean": 0.022024717181921005, + "step": 372 + }, + { + "clip_ratio/high_max": 4.341634394222638e-06, + "clip_ratio/high_mean": 1.0854085985556594e-06, + "clip_ratio/low_mean": 3.061858558339736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.170399429563986e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14442.0, + "completions/mean_length": 7120.0, + 
"completions/mean_terminated_length": 6897.66455078125, + "completions/min_length": 1685.0, + "completions/min_terminated_length": 1685.0, + "entropy": 1.0812252908945084, + "epoch": 0.34314627414903404, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018919071881100535, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 308804876.0, + "reward": 0.28125, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999696612358093, + "sampling/importance_sampling_ratio/min": 0.0011743507348001003, + "sampling/sampling_logp_difference/max": 6.747039794921875, + "sampling/sampling_logp_difference/mean": 0.022177904844284058, + "step": 373 + }, + { + "clip_ratio/high_max": 4.6198765630833805e-06, + "clip_ratio/high_mean": 1.1549691407708451e-06, + "clip_ratio/low_mean": 1.3996559573570266e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5151528714341111e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15814.0, + "completions/mean_length": 7344.5546875, + "completions/mean_terminated_length": 6977.09716796875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.9340410158038139, + "epoch": 0.34406623735050595, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001848200336098671, + "learning_rate": 1e-05, + "loss": 0.0195, + "num_tokens": 309762603.0, + "reward": 0.4296875, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999948143959045, + "sampling/importance_sampling_ratio/min": 0.0002964614541269839, + "sampling/sampling_logp_difference/max": 8.1235933303833, + 
"sampling/sampling_logp_difference/mean": 0.02034556306898594, + "step": 374 + }, + { + "clip_ratio/high_max": 1.3913735983805964e-05, + "clip_ratio/high_mean": 3.478433995951491e-06, + "clip_ratio/low_mean": 2.4544106395296694e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8022539936500834e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15244.0, + "completions/max_terminated_length": 15244.0, + "completions/mean_length": 6615.6484375, + "completions/mean_terminated_length": 6615.6484375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.971637412905693, + "epoch": 0.34498620055197793, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0012123315827921033, + "learning_rate": 1e-05, + "loss": 0.0581, + "num_tokens": 310628230.0, + "reward": 0.4296875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999868869781494, + "sampling/importance_sampling_ratio/min": 2.587145718280226e-05, + "sampling/sampling_logp_difference/max": 10.562370300292969, + "sampling/sampling_logp_difference/mean": 0.020877305418252945, + "step": 375 + }, + { + "clip_ratio/high_max": 6.119951194705209e-06, + "clip_ratio/high_mean": 1.5299877986763022e-06, + "clip_ratio/low_mean": 4.789722436271404e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.942721272982453e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16028.0, + "completions/mean_length": 6333.84375, + "completions/mean_terminated_length": 6009.64501953125, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "entropy": 0.9569023698568344, + "epoch": 0.34590616375344985, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002646032487973571, + "learning_rate": 
1e-05, + "loss": 0.086, + "num_tokens": 311457466.0, + "reward": 0.4453125, + "reward_std": 0.34928950667381287, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000247955322266, + "sampling/importance_sampling_ratio/min": 0.022760435938835144, + "sampling/sampling_logp_difference/max": 3.782731533050537, + "sampling/sampling_logp_difference/mean": 0.020464638248085976, + "step": 376 + }, + { + "clip_ratio/high_max": 1.8126566374121467e-05, + "clip_ratio/high_mean": 4.531641593530367e-06, + "clip_ratio/low_mean": 4.1024483266483e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5556124632639694e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15719.0, + "completions/mean_length": 6657.8515625, + "completions/mean_terminated_length": 6503.46875, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.029910758137703, + "epoch": 0.3468261269549218, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021437006071209908, + "learning_rate": 1e-05, + "loss": -0.0212, + "num_tokens": 312330879.0, + "reward": 0.4453125, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000024437904358, + "sampling/importance_sampling_ratio/min": 0.020200612023472786, + "sampling/sampling_logp_difference/max": 3.9020423889160156, + "sampling/sampling_logp_difference/mean": 0.021411258727312088, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7961265118392475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7961265118392475e-05, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16311.0, + "completions/mean_length": 7657.8359375, + "completions/mean_terminated_length": 7376.3466796875, + "completions/min_length": 741.0, + "completions/min_terminated_length": 741.0, + "entropy": 0.9699486121535301, + "epoch": 0.34774609015639374, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018965511117130518, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 313331898.0, + "reward": 0.3515625, + "reward_std": 0.18884865939617157, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000010371208191, + "sampling/importance_sampling_ratio/min": 7.867415661166888e-06, + "sampling/sampling_logp_difference/max": 11.75278091430664, + "sampling/sampling_logp_difference/mean": 0.021029409021139145, + "step": 378 + }, + { + "clip_ratio/high_max": 7.721664815107943e-06, + "clip_ratio/high_mean": 2.7168170504410227e-06, + "clip_ratio/low_mean": 4.313065619498957e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.584747375702136e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14619.0, + "completions/mean_length": 7085.3671875, + "completions/mean_terminated_length": 6937.77001953125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 1.0943557620048523, + "epoch": 0.3486660533578657, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016498853219673038, + "learning_rate": 1e-05, + "loss": 0.0346, + "num_tokens": 314258601.0, + "reward": 0.3203125, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000105857849121, + 
"sampling/importance_sampling_ratio/min": 0.03447282314300537, + "sampling/sampling_logp_difference/max": 3.367583990097046, + "sampling/sampling_logp_difference/mean": 0.021414825692772865, + "step": 379 + }, + { + "clip_ratio/high_max": 7.953489330247976e-06, + "clip_ratio/high_mean": 1.988372332561994e-06, + "clip_ratio/low_mean": 3.479703536868328e-05, + "clip_ratio/low_min": 2.6767741019284585e-06, + "clip_ratio/region_mean": 3.6785407701245276e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15631.0, + "completions/mean_length": 7614.1171875, + "completions/mean_terminated_length": 7182.81103515625, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "entropy": 0.9673903658986092, + "epoch": 0.34958601655933763, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001364902127534151, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 315256840.0, + "reward": 0.4296875, + "reward_std": 0.3503454327583313, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 6.874255632283166e-05, + "sampling/sampling_logp_difference/max": 9.585142135620117, + "sampling/sampling_logp_difference/mean": 0.02000460773706436, + "step": 380 + }, + { + "clip_ratio/high_max": 6.980824764468707e-06, + "clip_ratio/high_mean": 1.7452061911171768e-06, + "clip_ratio/low_mean": 4.410173994529032e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5846945681660145e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15637.0, + "completions/mean_length": 7766.9375, + "completions/mean_terminated_length": 7630.1591796875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + 
"entropy": 1.0277370810508728, + "epoch": 0.35050597976080955, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002171436557546258, + "learning_rate": 1e-05, + "loss": 0.0705, + "num_tokens": 316268976.0, + "reward": 0.34375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999735951423645, + "sampling/importance_sampling_ratio/min": 7.485197420464829e-05, + "sampling/sampling_logp_difference/max": 9.499998092651367, + "sampling/sampling_logp_difference/mean": 0.021251089870929718, + "step": 381 + }, + { + "clip_ratio/high_max": 9.843256520980503e-06, + "clip_ratio/high_mean": 3.5061395919910865e-06, + "clip_ratio/low_mean": 3.973216325903195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.323830307839671e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15752.0, + "completions/mean_length": 7774.2265625, + "completions/mean_terminated_length": 7567.59228515625, + "completions/min_length": 595.0, + "completions/min_terminated_length": 595.0, + "entropy": 1.0064171329140663, + "epoch": 0.3514259429622815, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0013348929351195693, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 317285677.0, + "reward": 0.28125, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999904632568359, + "sampling/importance_sampling_ratio/min": 1.7632934259381727e-06, + "sampling/sampling_logp_difference/max": 13.248327255249023, + "sampling/sampling_logp_difference/mean": 0.022232960909605026, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 3.2021426648043416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2021426648043416e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16208.0, + "completions/mean_length": 6547.1796875, + "completions/mean_terminated_length": 6469.724609375, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 0.9192209765315056, + "epoch": 0.35234590616375344, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002925361506640911, + "learning_rate": 1e-05, + "loss": 0.0809, + "num_tokens": 318148276.0, + "reward": 0.515625, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999691843986511, + "sampling/importance_sampling_ratio/min": 7.411971182591515e-06, + "sampling/sampling_logp_difference/max": 11.812414169311523, + "sampling/sampling_logp_difference/mean": 0.020470617339015007, + "step": 383 + }, + { + "clip_ratio/high_max": 1.543848429719219e-05, + "clip_ratio/high_mean": 3.8596210742980475e-06, + "clip_ratio/low_mean": 2.0332364726982632e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4191985573907004e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6904.40625, + "completions/mean_terminated_length": 6101.05078125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.9611739367246628, + "epoch": 0.3532658693652254, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002288331277668476, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 319052224.0, + "reward": 0.390625, + "reward_std": 0.23645779490470886, + "rewards/accuracy_reward/mean": 0.390625, + 
"rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999583959579468, + "sampling/importance_sampling_ratio/min": 1.0906596799031831e-05, + "sampling/sampling_logp_difference/max": 11.426142692565918, + "sampling/sampling_logp_difference/mean": 0.02049478143453598, + "step": 384 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 319052224, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-384/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-384/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-384/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . 
output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = 
sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, 
ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = 
len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + 
print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + 
if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = 
zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. + """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = 
torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = 
GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for 
training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensor instead of torch tensor, which is more memory efficient. + Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.items(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it is no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/README.md b/dapo_lora_plus_20251202_001141/checkpoint-448/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-448/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users 
(both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-448/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-448/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", 
+ "k_proj", + "gate_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-448/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-448/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false 
-%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/latest b/dapo_lora_plus_20251202_001141/checkpoint-448/latest new file mode 100644 index 0000000000000000000000000000000000000000..6c83691d1f18f1aa59c0994e76f1e0d010c88273 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-448/latest @@ -0,0 +1 @@ +global_step448 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-448/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-448/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/tokenizer_config.json 
b/dapo_lora_plus_20251202_001141/checkpoint-448/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-448/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": 
"LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-448/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..144608798fd985252409e72e2ff77d3c5e6f92a1 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-448/trainer_state.json @@ -0,0 +1,13922 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.41214351425942963, + "eval_steps": 500, + "global_step": 448, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025745572056621313, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 5.499582130141789e-06, + "clip_ratio/high_mean": 1.3748955325354473e-06, + "clip_ratio/low_mean": 
2.871888784738985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009378326623846e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 4767.1875, + "completions/mean_terminated_length": 4767.1875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.088237851858139, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002068034838885069, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 1425798.0, + "reward": 0.3046875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 0.01811397261917591, + "sampling/sampling_logp_difference/max": 4.011071681976318, + "sampling/sampling_logp_difference/mean": 0.01877593621611595, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.459846724103045e-05, + "clip_ratio/low_min": 3.4060874440910993e-06, + "clip_ratio/region_mean": 4.459846724103045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 6586.359375, + "completions/mean_terminated_length": 6351.21630859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0497623533010483, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001971944235265255, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 2287420.0, + "reward": 0.28125, + "reward_std": 0.29143062233924866, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999316334724426, + "sampling/importance_sampling_ratio/min": 5.356698966352269e-05, + "sampling/sampling_logp_difference/max": 9.834577560424805, + "sampling/sampling_logp_difference/mean": 0.02137824520468712, + "step": 3 + }, + { + "clip_ratio/high_max": 1.7640652004047297e-05, + "clip_ratio/high_mean": 5.48578327652649e-06, + "clip_ratio/low_mean": 3.218628648937738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.767206976590387e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14690.0, + "completions/max_terminated_length": 14690.0, + "completions/mean_length": 5448.0234375, + "completions/mean_terminated_length": 5448.0234375, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 1.1134418621659279, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016465173102915287, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 3009167.0, + "reward": 0.2890625, + "reward_std": 0.27958330512046814, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 7.889385415182915e-06, + "sampling/sampling_logp_difference/max": 11.749992370605469, + "sampling/sampling_logp_difference/mean": 0.020580951124429703, + "step": 4 + }, + { + "clip_ratio/high_max": 1.3439519989333348e-05, + "clip_ratio/high_mean": 3.359879997333337e-06, + "clip_ratio/low_mean": 2.8849915906903334e-05, + "clip_ratio/low_min": 8.467687621305231e-06, + "clip_ratio/region_mean": 3.220979442630778e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13420.0, + "completions/mean_length": 5436.8671875, + 
"completions/mean_terminated_length": 5350.66943359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 1.1473859176039696, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023770295083522797, + "learning_rate": 1e-05, + "loss": 0.0153, + "num_tokens": 3725654.0, + "reward": 0.2734375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991774559021, + "sampling/importance_sampling_ratio/min": 0.0011146117467433214, + "sampling/sampling_logp_difference/max": 6.799249172210693, + "sampling/sampling_logp_difference/mean": 0.020377254113554955, + "step": 5 + }, + { + "clip_ratio/high_max": 4.652201369026443e-06, + "clip_ratio/high_mean": 1.1630503422566107e-06, + "clip_ratio/low_mean": 2.8399212624208303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9562263534899103e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14440.0, + "completions/max_terminated_length": 14440.0, + "completions/mean_length": 4697.5390625, + "completions/mean_terminated_length": 4697.5390625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.0097229778766632, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003342699259519577, + "learning_rate": 1e-05, + "loss": 0.0326, + "num_tokens": 4345547.0, + "reward": 0.390625, + "reward_std": 0.34480881690979004, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914765357971, + "sampling/importance_sampling_ratio/min": 0.002385853324085474, + "sampling/sampling_logp_difference/max": 6.038198471069336, + 
"sampling/sampling_logp_difference/mean": 0.0185473021119833, + "step": 6 + }, + { + "clip_ratio/high_max": 9.362594937556423e-06, + "clip_ratio/high_mean": 2.340648734389106e-06, + "clip_ratio/low_mean": 6.054362825125281e-05, + "clip_ratio/low_min": 7.427356649714056e-06, + "clip_ratio/region_mean": 6.288427744038927e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14652.0, + "completions/mean_length": 6218.2109375, + "completions/mean_terminated_length": 5890.2822265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.0579778030514717, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002073560608550906, + "learning_rate": 1e-05, + "loss": 0.0201, + "num_tokens": 5160646.0, + "reward": 0.2109375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 0.00044544730917550623, + "sampling/sampling_logp_difference/max": 7.716431617736816, + "sampling/sampling_logp_difference/mean": 0.020321575924754143, + "step": 7 + }, + { + "clip_ratio/high_max": 1.1064067621191498e-05, + "clip_ratio/high_mean": 2.7660169052978745e-06, + "clip_ratio/low_mean": 2.2175867059104348e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4941883737028547e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13637.0, + "completions/mean_length": 5127.8359375, + "completions/mean_terminated_length": 5039.20458984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.0472618415951729, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.0032994600478559732, + "learning_rate": 1e-05, + "loss": 0.0751, + "num_tokens": 5836289.0, + "reward": 0.3359375, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483227729797, + "sampling/importance_sampling_ratio/min": 0.0013780994340777397, + "sampling/sampling_logp_difference/max": 6.587049961090088, + "sampling/sampling_logp_difference/mean": 0.01940803974866867, + "step": 8 + }, + { + "clip_ratio/high_max": 1.2357884770608507e-05, + "clip_ratio/high_mean": 3.0894711926521268e-06, + "clip_ratio/low_mean": 3.000627111759968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.309574231025181e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15916.0, + "completions/mean_length": 4516.890625, + "completions/mean_terminated_length": 4423.44873046875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.911251038312912, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003016560571268201, + "learning_rate": 1e-05, + "loss": 0.1006, + "num_tokens": 6433171.0, + "reward": 0.390625, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999179840087891, + "sampling/importance_sampling_ratio/min": 0.005480794236063957, + "sampling/sampling_logp_difference/max": 5.206505298614502, + "sampling/sampling_logp_difference/mean": 0.017437148839235306, + "step": 9 + }, + { + "clip_ratio/high_max": 4.6329013457580004e-05, + "clip_ratio/high_mean": 1.1582253364395001e-05, + "clip_ratio/low_mean": 7.069455705277505e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 8.227681109929108e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13970.0, + "completions/mean_length": 4961.453125, + "completions/mean_terminated_length": 4687.31201171875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "entropy": 0.6808596402406693, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0035386616364121437, + "learning_rate": 1e-05, + "loss": 0.0596, + "num_tokens": 7085389.0, + "reward": 0.5625, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0002734088629949838, + "sampling/sampling_logp_difference/max": 8.20454216003418, + "sampling/sampling_logp_difference/mean": 0.01566406339406967, + "step": 10 + }, + { + "clip_ratio/high_max": 2.43190661421977e-05, + "clip_ratio/high_mean": 6.079766535549425e-06, + "clip_ratio/low_mean": 2.2395396172214532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8475162707763957e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14776.0, + "completions/mean_length": 4429.40625, + "completions/mean_terminated_length": 4335.275390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9181502386927605, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022535293828696012, + "learning_rate": 1e-05, + "loss": 0.0031, + "num_tokens": 7672185.0, + "reward": 0.3671875, + "reward_std": 0.20357418060302734, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998801946640015, + "sampling/importance_sampling_ratio/min": 5.315856554943821e-08, + "sampling/sampling_logp_difference/max": 16.74998664855957, + "sampling/sampling_logp_difference/mean": 0.018429335206747055, + "step": 11 + }, + { + "clip_ratio/high_max": 1.0117325928149512e-05, + "clip_ratio/high_mean": 2.529331482037378e-06, + "clip_ratio/low_mean": 1.1982813475697185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.45121450714214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5282.6796875, + "completions/mean_terminated_length": 5106.46875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "entropy": 1.113751620054245, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013591813622042537, + "learning_rate": 1e-05, + "loss": 0.0971, + "num_tokens": 8369000.0, + "reward": 0.3984375, + "reward_std": 0.3029736578464508, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 3.970265970565379e-05, + "sampling/sampling_logp_difference/max": 10.134092330932617, + "sampling/sampling_logp_difference/mean": 0.020221836864948273, + "step": 12 + }, + { + "clip_ratio/high_max": 5.411958227341529e-06, + "clip_ratio/high_mean": 1.3529895568353822e-06, + "clip_ratio/low_mean": 2.5284593846208736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6637583516730956e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15925.0, + "completions/mean_length": 6970.421875, + "completions/mean_terminated_length": 6744.49609375, + "completions/min_length": 53.0, + 
"completions/min_terminated_length": 53.0, + "entropy": 1.1721933633089066, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024079051800072193, + "learning_rate": 1e-05, + "loss": 0.0713, + "num_tokens": 9283182.0, + "reward": 0.171875, + "reward_std": 0.17965975403785706, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999163746833801, + "sampling/importance_sampling_ratio/min": 0.0008915197686292231, + "sampling/sampling_logp_difference/max": 7.0225830078125, + "sampling/sampling_logp_difference/mean": 0.021462474018335342, + "step": 13 + }, + { + "clip_ratio/high_max": 2.0661535927501973e-05, + "clip_ratio/high_mean": 5.165383981875493e-06, + "clip_ratio/low_mean": 2.4304956298237812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.947033948430544e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14658.0, + "completions/max_terminated_length": 14658.0, + "completions/mean_length": 4886.875, + "completions/mean_terminated_length": 4886.875, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 1.0108910650014877, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002063734456896782, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 9928446.0, + "reward": 0.3515625, + "reward_std": 0.2409384697675705, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 0.0003672837920021266, + "sampling/sampling_logp_difference/max": 7.9093756675720215, + "sampling/sampling_logp_difference/mean": 0.01918785460293293, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.4761846993424115e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4761846993424115e-06, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12992.0, + "completions/max_terminated_length": 12992.0, + "completions/mean_length": 4824.0078125, + "completions/mean_terminated_length": 4824.0078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 1.1070282831788063, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002424790756776929, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 10566415.0, + "reward": 0.28125, + "reward_std": 0.23698672652244568, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0011708867968991399, + "sampling/sampling_logp_difference/max": 6.749993801116943, + "sampling/sampling_logp_difference/mean": 0.02069389820098877, + "step": 15 + }, + { + "clip_ratio/high_max": 3.5075904634140898e-06, + "clip_ratio/high_mean": 8.768976158535224e-07, + "clip_ratio/low_mean": 2.2676964135825983e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3553861751679506e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12685.0, + "completions/mean_length": 5449.4140625, + "completions/mean_terminated_length": 5363.31494140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9817888736724854, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021046048495918512, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 11281908.0, + "reward": 0.2265625, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.2265625, + 
"rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.013273254036903381, + "sampling/sampling_logp_difference/max": 4.322004318237305, + "sampling/sampling_logp_difference/mean": 0.019556276500225067, + "step": 16 + }, + { + "clip_ratio/high_max": 1.624216065465589e-05, + "clip_ratio/high_mean": 4.060540163663973e-06, + "clip_ratio/low_mean": 5.4349347919924185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.840988796990132e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14133.0, + "completions/max_terminated_length": 14133.0, + "completions/mean_length": 5343.25, + "completions/mean_terminated_length": 5343.25, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 1.04741720110178, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035894038155674934, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 11987692.0, + "reward": 0.3359375, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998996257781982, + "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05, + "sampling/sampling_logp_difference/max": 10.749964714050293, + "sampling/sampling_logp_difference/mean": 0.020530637353658676, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.272115029380075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.272115029380075e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15138.0, + "completions/mean_length": 6301.9375, + "completions/mean_terminated_length": 
5806.09814453125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.8892941772937775, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032246762420982122, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 12814244.0, + "reward": 0.3125, + "reward_std": 0.3606000542640686, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999184608459473, + "sampling/importance_sampling_ratio/min": 0.021351110190153122, + "sampling/sampling_logp_difference/max": 3.846651554107666, + "sampling/sampling_logp_difference/mean": 0.017541853711009026, + "step": 18 + }, + { + "clip_ratio/high_max": 9.956602298188955e-06, + "clip_ratio/high_mean": 2.4891505745472386e-06, + "clip_ratio/low_mean": 2.772165316855535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0210803743102588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16213.0, + "completions/max_terminated_length": 16213.0, + "completions/mean_length": 5297.46875, + "completions/mean_terminated_length": 5297.46875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8097029253840446, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023969109170138836, + "learning_rate": 1e-05, + "loss": -0.0153, + "num_tokens": 13512520.0, + "reward": 0.359375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999222159385681, + "sampling/importance_sampling_ratio/min": 0.005766105372458696, + "sampling/sampling_logp_difference/max": 5.155758380889893, + "sampling/sampling_logp_difference/mean": 0.017464376986026764, + "step": 19 + }, + { + 
"clip_ratio/high_max": 1.0098337497765897e-05, + "clip_ratio/high_mean": 2.524584374441474e-06, + "clip_ratio/low_mean": 3.173396362399217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.425854845318099e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14655.0, + "completions/mean_length": 4890.34375, + "completions/mean_terminated_length": 4799.84228515625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.9267145916819572, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002759338356554508, + "learning_rate": 1e-05, + "loss": -0.0014, + "num_tokens": 14155556.0, + "reward": 0.3515625, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.008491010405123234, + "sampling/sampling_logp_difference/max": 4.768747329711914, + "sampling/sampling_logp_difference/mean": 0.018839433789253235, + "step": 20 + }, + { + "clip_ratio/high_max": 7.532389190600952e-06, + "clip_ratio/high_mean": 1.883097297650238e-06, + "clip_ratio/low_mean": 1.9051809317716106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0934906729053182e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16296.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 4609.40625, + "completions/mean_terminated_length": 4609.40625, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 1.171089917421341, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021055075339972973, + "learning_rate": 1e-05, + "loss": -0.0051, + "num_tokens": 14765328.0, + "reward": 0.2421875, + "reward_std": 
0.2409384548664093, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741911888123, + "sampling/importance_sampling_ratio/min": 5.368983693188056e-07, + "sampling/sampling_logp_difference/max": 14.437457084655762, + "sampling/sampling_logp_difference/mean": 0.020226795226335526, + "step": 21 + }, + { + "clip_ratio/high_max": 1.7169573766295798e-05, + "clip_ratio/high_mean": 4.2923934415739495e-06, + "clip_ratio/low_mean": 5.869748633813288e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.0162142189074075e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14299.0, + "completions/mean_length": 5099.0390625, + "completions/mean_terminated_length": 5010.18115234375, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.005959376692772, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027595218271017075, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 15438549.0, + "reward": 0.296875, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887347221375, + "sampling/importance_sampling_ratio/min": 0.00013984869292471558, + "sampling/sampling_logp_difference/max": 8.87494945526123, + "sampling/sampling_logp_difference/mean": 0.01902824640274048, + "step": 22 + }, + { + "clip_ratio/high_max": 5.162942670722259e-06, + "clip_ratio/high_mean": 1.2907356676805648e-06, + "clip_ratio/low_mean": 3.6872071063953626e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.816280593582633e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16204.0, + "completions/mean_length": 7138.0390625, + "completions/mean_terminated_length": 6839.7822265625, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.0403362140059471, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002748022088780999, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 16373898.0, + "reward": 0.296875, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999048709869385, + "sampling/importance_sampling_ratio/min": 0.0003802926803473383, + "sampling/sampling_logp_difference/max": 7.874569416046143, + "sampling/sampling_logp_difference/mean": 0.020853528752923012, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.6506045439164154e-05, + "clip_ratio/low_min": 5.709326615033206e-06, + "clip_ratio/region_mean": 5.6506045439164154e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14543.0, + "completions/mean_length": 5420.515625, + "completions/mean_terminated_length": 5334.18896484375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 1.1339883506298065, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029502976685762405, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 17088156.0, + "reward": 0.1953125, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 9.70982582657598e-05, + 
"sampling/sampling_logp_difference/max": 9.239787101745605, + "sampling/sampling_logp_difference/mean": 0.0199423898011446, + "step": 24 + }, + { + "clip_ratio/high_max": 5.619998319161823e-06, + "clip_ratio/high_mean": 1.4049995797904558e-06, + "clip_ratio/low_mean": 6.439320418394345e-05, + "clip_ratio/low_min": 4.70632539872895e-06, + "clip_ratio/region_mean": 6.57982034226734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14636.0, + "completions/mean_length": 5116.3046875, + "completions/mean_terminated_length": 4845.88037109375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.9503882825374603, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004891107324510813, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 17766619.0, + "reward": 0.3203125, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0010618992382660508, + "sampling/sampling_logp_difference/max": 6.847696304321289, + "sampling/sampling_logp_difference/mean": 0.01914183795452118, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.839018643247982e-05, + "clip_ratio/low_min": 4.115091087442124e-06, + "clip_ratio/region_mean": 3.839018643247982e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14634.0, + "completions/mean_length": 5061.8671875, + "completions/mean_terminated_length": 4972.71630859375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.0540335327386856, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0030373274348676205, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 18432938.0, + "reward": 0.34375, + "reward_std": 0.28118088841438293, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999624490737915, + "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06, + "sampling/sampling_logp_difference/max": 13.272432327270508, + "sampling/sampling_logp_difference/mean": 0.019548218697309494, + "step": 26 + }, + { + "clip_ratio/high_max": 1.4656657867817557e-05, + "clip_ratio/high_mean": 4.665093399580655e-06, + "clip_ratio/low_mean": 3.751162262233265e-05, + "clip_ratio/low_min": 4.413062470121076e-06, + "clip_ratio/region_mean": 4.2176716192443564e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15782.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6349.9765625, + "completions/mean_terminated_length": 6349.9765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0268081277608871, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017623496241867542, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 19264743.0, + "reward": 0.2734375, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 6.870362267363816e-05, + "sampling/sampling_logp_difference/max": 9.585708618164062, + "sampling/sampling_logp_difference/mean": 0.019106190651655197, + "step": 27 + }, + { + "clip_ratio/high_max": 9.221375876222737e-06, + "clip_ratio/high_mean": 2.3053439690556843e-06, + "clip_ratio/low_mean": 3.09787185415189e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.328406273794826e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 5815.484375, + "completions/mean_terminated_length": 5561.84033203125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 1.0389493256807327, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003111837198957801, + "learning_rate": 1e-05, + "loss": -0.0162, + "num_tokens": 20030109.0, + "reward": 0.34375, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000298023223877, + "sampling/importance_sampling_ratio/min": 0.02987043187022209, + "sampling/sampling_logp_difference/max": 3.5108861923217773, + "sampling/sampling_logp_difference/mean": 0.020060991868376732, + "step": 28 + }, + { + "clip_ratio/high_max": 6.7810142354574054e-06, + "clip_ratio/high_mean": 1.6952535588643514e-06, + "clip_ratio/low_mean": 4.474762545214617e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644287901101052e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 5157.1484375, + "completions/mean_terminated_length": 5068.748046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.0510126948356628, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003041633637621999, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 20710904.0, + "reward": 0.3125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.04357198625802994, + "sampling/sampling_logp_difference/max": 3.133340835571289, + "sampling/sampling_logp_difference/mean": 0.019007597118616104, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0962848566341563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0962848566341563e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15333.0, + "completions/max_terminated_length": 15333.0, + "completions/mean_length": 4446.3828125, + "completions/mean_terminated_length": 4446.3828125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.053279548883438, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022369560319930315, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 21298497.0, + "reward": 0.390625, + "reward_std": 0.24169495701789856, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998750686645508, + "sampling/importance_sampling_ratio/min": 0.006704842206090689, + "sampling/sampling_logp_difference/max": 5.00492525100708, + "sampling/sampling_logp_difference/mean": 0.01947362720966339, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8460265411922592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8460265411922592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15386.0, + "completions/mean_length": 6294.1484375, + "completions/mean_terminated_length": 6133.9921875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, 
+ "entropy": 1.2036212533712387, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021383841522037983, + "learning_rate": 1e-05, + "loss": 0.033, + "num_tokens": 22124812.0, + "reward": 0.171875, + "reward_std": 0.20752590894699097, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858736991882, + "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07, + "sampling/sampling_logp_difference/max": 14.742476463317871, + "sampling/sampling_logp_difference/mean": 0.022367021068930626, + "step": 31 + }, + { + "clip_ratio/high_max": 1.73864664247958e-05, + "clip_ratio/high_mean": 4.34661660619895e-06, + "clip_ratio/low_mean": 3.19569651310303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630358173722925e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14893.0, + "completions/mean_length": 6011.4921875, + "completions/mean_terminated_length": 5929.81884765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.123318687081337, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00126531848218292, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 22915091.0, + "reward": 0.171875, + "reward_std": 0.2330477386713028, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05, + "sampling/sampling_logp_difference/max": 11.02016544342041, + "sampling/sampling_logp_difference/mean": 0.019905246794223785, + "step": 32 + }, + { + "clip_ratio/high_max": 2.8753217975463485e-05, + "clip_ratio/high_mean": 
7.188304493865871e-06, + "clip_ratio/low_mean": 3.818478444372886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.537308905128157e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5152.46875, + "completions/mean_terminated_length": 5064.03125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 1.0477670058608055, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030069497879594564, + "learning_rate": 1e-05, + "loss": 0.1026, + "num_tokens": 23596487.0, + "reward": 0.3359375, + "reward_std": 0.29142576456069946, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 9.009604013954231e-07, + "sampling/sampling_logp_difference/max": 13.919804573059082, + "sampling/sampling_logp_difference/mean": 0.019003981724381447, + "step": 33 + }, + { + "clip_ratio/high_max": 3.069575450354023e-05, + "clip_ratio/high_mean": 7.673938625885057e-06, + "clip_ratio/low_mean": 3.4847614415411954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.252155258654966e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12792.0, + "completions/max_terminated_length": 12792.0, + "completions/mean_length": 4672.5703125, + "completions/mean_terminated_length": 4672.5703125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9471446052193642, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002676331205293536, + "learning_rate": 1e-05, + "loss": 0.0724, + "num_tokens": 24213408.0, + "reward": 0.3203125, + "reward_std": 0.2988021969795227, + "rewards/accuracy_reward/mean": 0.3203125, + 
"rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000251531600952, + "sampling/importance_sampling_ratio/min": 0.0013351094676181674, + "sampling/sampling_logp_difference/max": 6.618741989135742, + "sampling/sampling_logp_difference/mean": 0.0179576613008976, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6127243245355203e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6127243245355203e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16108.0, + "completions/mean_length": 7013.734375, + "completions/mean_terminated_length": 6711.4677734375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 1.1254516392946243, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023615453392267227, + "learning_rate": 1e-05, + "loss": 0.0384, + "num_tokens": 25130262.0, + "reward": 0.1953125, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06, + "sampling/sampling_logp_difference/max": 11.925450325012207, + "sampling/sampling_logp_difference/mean": 0.0215257927775383, + "step": 35 + }, + { + "clip_ratio/high_max": 4.06954040954588e-06, + "clip_ratio/high_mean": 1.01738510238647e-06, + "clip_ratio/low_mean": 4.180071573500754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.281810015527299e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5858.59375, + 
"completions/mean_terminated_length": 5605.984375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 1.0713739022612572, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029018481727689505, + "learning_rate": 1e-05, + "loss": 0.1041, + "num_tokens": 25898194.0, + "reward": 0.3671875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05, + "sampling/sampling_logp_difference/max": 10.992064476013184, + "sampling/sampling_logp_difference/mean": 0.019959844648838043, + "step": 36 + }, + { + "clip_ratio/high_max": 1.2810827229259303e-05, + "clip_ratio/high_mean": 3.2027068073148257e-06, + "clip_ratio/low_mean": 3.29701083501277e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.617281504375569e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14004.0, + "completions/mean_length": 6952.6015625, + "completions/mean_terminated_length": 6726.24853515625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.028619796037674, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022342968732118607, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 26812791.0, + "reward": 0.234375, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 4.540153167909011e-05, + "sampling/sampling_logp_difference/max": 9.999964714050293, + 
"sampling/sampling_logp_difference/mean": 0.02002539485692978, + "step": 37 + }, + { + "clip_ratio/high_max": 1.5225089100567857e-05, + "clip_ratio/high_mean": 6.960676159906143e-06, + "clip_ratio/low_mean": 4.09088329433871e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7869508762232726e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16361.0, + "completions/mean_length": 6413.421875, + "completions/mean_terminated_length": 6174.12841796875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9452399462461472, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021800603717565536, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 27652757.0, + "reward": 0.296875, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439120292664, + "sampling/importance_sampling_ratio/min": 3.895394547726028e-05, + "sampling/sampling_logp_difference/max": 10.153130531311035, + "sampling/sampling_logp_difference/mean": 0.019722118973731995, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.9564903318023426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9564903318023426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15754.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 5176.3515625, + "completions/mean_terminated_length": 5176.3515625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 1.0444758981466293, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004153470974415541, + "learning_rate": 1e-05, + "loss": 0.0798, + 
"num_tokens": 28334386.0, + "reward": 0.2734375, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.007421077694743872, + "sampling/sampling_logp_difference/max": 4.903430938720703, + "sampling/sampling_logp_difference/mean": 0.020159056410193443, + "step": 39 + }, + { + "clip_ratio/high_max": 1.725743459246587e-05, + "clip_ratio/high_mean": 4.3143586481164675e-06, + "clip_ratio/low_mean": 2.0204584302518924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.451894306432223e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 5178.9921875, + "completions/mean_terminated_length": 5001.13525390625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0803537145256996, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002477057045325637, + "learning_rate": 1e-05, + "loss": 0.0067, + "num_tokens": 29017145.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000497102737427, + "sampling/importance_sampling_ratio/min": 0.004630985204130411, + "sampling/sampling_logp_difference/max": 5.374985694885254, + "sampling/sampling_logp_difference/mean": 0.019826076924800873, + "step": 40 + }, + { + "clip_ratio/high_max": 1.6637992303003557e-05, + "clip_ratio/high_mean": 4.159498075750889e-06, + "clip_ratio/low_mean": 2.1970684144889674e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6130182106953725e-05, + "completions/clipped_ratio": 
0.0, + "completions/max_length": 14131.0, + "completions/max_terminated_length": 14131.0, + "completions/mean_length": 4980.359375, + "completions/mean_terminated_length": 4980.359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.9510642662644386, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016275218222290277, + "learning_rate": 1e-05, + "loss": -0.0097, + "num_tokens": 29673535.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750852584839, + "sampling/importance_sampling_ratio/min": 0.000599516904912889, + "sampling/sampling_logp_difference/max": 7.419386386871338, + "sampling/sampling_logp_difference/mean": 0.01844976656138897, + "step": 41 + }, + { + "clip_ratio/high_max": 2.8087193186365766e-05, + "clip_ratio/high_mean": 7.021798296591442e-06, + "clip_ratio/low_mean": 3.9683913541921356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.670571286169434e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 5778.6953125, + "completions/mean_terminated_length": 5695.18896484375, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 1.0413239300251007, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001847646082751453, + "learning_rate": 1e-05, + "loss": -0.0045, + "num_tokens": 30436416.0, + "reward": 0.2578125, + "reward_std": 0.33903977274894714, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998501539230347, + 
"sampling/importance_sampling_ratio/min": 0.00020348970429040492, + "sampling/sampling_logp_difference/max": 8.499895095825195, + "sampling/sampling_logp_difference/mean": 0.021502099931240082, + "step": 42 + }, + { + "clip_ratio/high_max": 2.68402091023745e-05, + "clip_ratio/high_mean": 8.575278570788214e-06, + "clip_ratio/low_mean": 4.547183698377921e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.404711600931478e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14182.0, + "completions/max_terminated_length": 14182.0, + "completions/mean_length": 4875.125, + "completions/mean_terminated_length": 4875.125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 1.0464690178632736, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021134833805263042, + "learning_rate": 1e-05, + "loss": 0.0727, + "num_tokens": 31083672.0, + "reward": 0.40625, + "reward_std": 0.3584783971309662, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340176582336, + "sampling/importance_sampling_ratio/min": 0.012113225646317005, + "sampling/sampling_logp_difference/max": 4.41345739364624, + "sampling/sampling_logp_difference/mean": 0.019140049815177917, + "step": 43 + }, + { + "clip_ratio/high_max": 3.9877967992651975e-05, + "clip_ratio/high_mean": 9.969491998162994e-06, + "clip_ratio/low_mean": 3.981287841270387e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9782369273998484e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 4691.421875, + "completions/mean_terminated_length": 4505.82568359375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 1.0229775309562683, + "epoch": 
0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037735572550445795, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 31703654.0, + "reward": 0.4453125, + "reward_std": 0.2993389964103699, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492168426514, + "sampling/importance_sampling_ratio/min": 0.03150063753128052, + "sampling/sampling_logp_difference/max": 3.457747459411621, + "sampling/sampling_logp_difference/mean": 0.01912039890885353, + "step": 44 + }, + { + "clip_ratio/high_max": 3.5441889849607833e-06, + "clip_ratio/high_mean": 8.860472462401958e-07, + "clip_ratio/low_mean": 1.5137359810069029e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6023407056309225e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 6821.96875, + "completions/mean_terminated_length": 6592.48046875, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 1.1132484003901482, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0010448681423440576, + "learning_rate": 1e-05, + "loss": 0.022, + "num_tokens": 32599778.0, + "reward": 0.2265625, + "reward_std": 0.1814819872379303, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999915361404419, + "sampling/importance_sampling_ratio/min": 0.006500681862235069, + "sampling/sampling_logp_difference/max": 5.035848140716553, + "sampling/sampling_logp_difference/mean": 0.02125459350645542, + "step": 45 + }, + { + "clip_ratio/high_max": 4.652893949241843e-06, + "clip_ratio/high_mean": 1.1632234873104608e-06, + "clip_ratio/low_mean": 
5.731516603191267e-05, + "clip_ratio/low_min": 9.891066838463303e-06, + "clip_ratio/region_mean": 5.8478389746596804e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 6834.3671875, + "completions/mean_terminated_length": 6605.17626953125, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9827468693256378, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0017670176457613707, + "learning_rate": 1e-05, + "loss": 0.1105, + "num_tokens": 33492737.0, + "reward": 0.3046875, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.0021202093921601772, + "sampling/sampling_logp_difference/max": 6.156240463256836, + "sampling/sampling_logp_difference/mean": 0.019490526989102364, + "step": 46 + }, + { + "clip_ratio/high_max": 6.717360520269722e-06, + "clip_ratio/high_mean": 2.503530367903295e-06, + "clip_ratio/low_mean": 2.5672919832686603e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8176450200589898e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14098.0, + "completions/mean_length": 6175.296875, + "completions/mean_terminated_length": 5845.98388671875, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 1.1584237962961197, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0016891945851966739, + "learning_rate": 1e-05, + "loss": -0.0008, + "num_tokens": 34312455.0, + "reward": 0.1875, + "reward_std": 0.19673937559127808, + "rewards/accuracy_reward/mean": 0.1875, + 
"rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 8.086384332273155e-05, + "sampling/sampling_logp_difference/max": 9.422743797302246, + "sampling/sampling_logp_difference/mean": 0.021749887615442276, + "step": 47 + }, + { + "clip_ratio/high_max": 2.2362002255249536e-05, + "clip_ratio/high_mean": 8.189798336388776e-06, + "clip_ratio/low_mean": 2.1058204993096297e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9248002192616696e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16054.0, + "completions/mean_length": 6036.8359375, + "completions/mean_terminated_length": 5955.3623046875, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.9301538467407227, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003834392176941037, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 35102738.0, + "reward": 0.4375, + "reward_std": 0.36614155769348145, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998494386672974, + "sampling/importance_sampling_ratio/min": 0.00013992394087836146, + "sampling/sampling_logp_difference/max": 8.874411582946777, + "sampling/sampling_logp_difference/mean": 0.019147861748933792, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1501961580506759e-05, + "clip_ratio/high_mean": 2.8754903951266897e-06, + "clip_ratio/low_mean": 4.08189714562468e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.369446196506033e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 
6262.46875, + "completions/mean_terminated_length": 5764.68798828125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.8599015846848488, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029804729856550694, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 35924886.0, + "reward": 0.3984375, + "reward_std": 0.3911295533180237, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999922513961792, + "sampling/importance_sampling_ratio/min": 0.00021375219512265176, + "sampling/sampling_logp_difference/max": 9.904524803161621, + "sampling/sampling_logp_difference/mean": 0.01815103553235531, + "step": 49 + }, + { + "clip_ratio/high_max": 2.4107544049911667e-05, + "clip_ratio/high_mean": 6.026886012477917e-06, + "clip_ratio/low_mean": 3.6588148361715866e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.261503391944643e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14556.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 5926.8984375, + "completions/mean_terminated_length": 5926.8984375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 1.0042993426322937, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022071697749197483, + "learning_rate": 1e-05, + "loss": 0.0059, + "num_tokens": 36700913.0, + "reward": 0.3359375, + "reward_std": 0.3306073546409607, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000010371208191, + "sampling/importance_sampling_ratio/min": 0.0005220364546403289, + "sampling/sampling_logp_difference/max": 7.557773113250732, + 
"sampling/sampling_logp_difference/mean": 0.01954064890742302, + "step": 50 + }, + { + "clip_ratio/high_max": 4.9106265578302555e-06, + "clip_ratio/high_mean": 1.2276566394575639e-06, + "clip_ratio/low_mean": 2.634599570683349e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7573652346291055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15217.0, + "completions/mean_length": 6873.6875, + "completions/mean_terminated_length": 6645.4404296875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 1.0255412608385086, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002320924773812294, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 37604865.0, + "reward": 0.234375, + "reward_std": 0.3135228157043457, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999098777770996, + "sampling/importance_sampling_ratio/min": 0.026153141632676125, + "sampling/sampling_logp_difference/max": 3.6437859535217285, + "sampling/sampling_logp_difference/mean": 0.019532475620508194, + "step": 51 + }, + { + "clip_ratio/high_max": 1.6350510122720152e-05, + "clip_ratio/high_mean": 4.087627530680038e-06, + "clip_ratio/low_mean": 2.351988746340794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7607515221461654e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15668.0, + "completions/mean_length": 6073.8984375, + "completions/mean_terminated_length": 5992.71630859375, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 1.0713753998279572, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002212709980085492, + 
"learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 38405196.0, + "reward": 0.359375, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998978972434998, + "sampling/importance_sampling_ratio/min": 8.706459084351081e-06, + "sampling/sampling_logp_difference/max": 11.651445388793945, + "sampling/sampling_logp_difference/mean": 0.021252838894724846, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.729486718384578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.729486718384578e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15299.0, + "completions/mean_length": 5838.71875, + "completions/mean_terminated_length": 5671.33349609375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 1.021155133843422, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001135052996687591, + "learning_rate": 1e-05, + "loss": 0.0178, + "num_tokens": 39171704.0, + "reward": 0.28125, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.003084881929680705, + "sampling/sampling_logp_difference/max": 5.7812418937683105, + "sampling/sampling_logp_difference/mean": 0.020781882107257843, + "step": 53 + }, + { + "clip_ratio/high_max": 1.7124169744420215e-05, + "clip_ratio/high_mean": 4.281042436105054e-06, + "clip_ratio/low_mean": 3.706903294187214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.135007543482061e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 14617.0, + "completions/max_terminated_length": 14617.0, + "completions/mean_length": 6358.5859375, + "completions/mean_terminated_length": 6358.5859375, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9720487147569656, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002638082252815366, + "learning_rate": 1e-05, + "loss": 0.0145, + "num_tokens": 40003859.0, + "reward": 0.40625, + "reward_std": 0.3174618184566498, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000380277633667, + "sampling/importance_sampling_ratio/min": 0.01960253342986107, + "sampling/sampling_logp_difference/max": 3.932096481323242, + "sampling/sampling_logp_difference/mean": 0.01991666667163372, + "step": 54 + }, + { + "clip_ratio/high_max": 6.55582925901399e-06, + "clip_ratio/high_mean": 2.994117721755174e-06, + "clip_ratio/low_mean": 2.222621503733535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5220332759090525e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14753.0, + "completions/max_terminated_length": 14753.0, + "completions/mean_length": 4634.1875, + "completions/mean_terminated_length": 4634.1875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9715309366583824, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001994960242882371, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 40616483.0, + "reward": 0.4375, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000698566436768, + 
"sampling/importance_sampling_ratio/min": 1.0510009815334342e-05, + "sampling/sampling_logp_difference/max": 11.46318244934082, + "sampling/sampling_logp_difference/mean": 0.01902047172188759, + "step": 55 + }, + { + "clip_ratio/high_max": 2.2474248908110894e-05, + "clip_ratio/high_mean": 7.571314540655294e-06, + "clip_ratio/low_mean": 4.3583780325207044e-05, + "clip_ratio/low_min": 4.6013396968191955e-06, + "clip_ratio/region_mean": 5.1155094070054474e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 6596.25, + "completions/mean_terminated_length": 6361.34423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.8207943215966225, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019902780186384916, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 41484443.0, + "reward": 0.4453125, + "reward_std": 0.326668381690979, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016689300537, + "sampling/importance_sampling_ratio/min": 7.485233072657138e-05, + "sampling/sampling_logp_difference/max": 9.499993324279785, + "sampling/sampling_logp_difference/mean": 0.018301833420991898, + "step": 56 + }, + { + "clip_ratio/high_max": 3.0019932637515012e-06, + "clip_ratio/high_mean": 7.504983159378753e-07, + "clip_ratio/low_mean": 4.332785601945943e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.407835376696312e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6785.75, + "completions/mean_terminated_length": 6313.70458984375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + 
"entropy": 0.9876058474183083, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015235114842653275, + "learning_rate": 1e-05, + "loss": 0.0128, + "num_tokens": 42372235.0, + "reward": 0.2421875, + "reward_std": 0.325075626373291, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999551773071289, + "sampling/importance_sampling_ratio/min": 0.026679370552301407, + "sampling/sampling_logp_difference/max": 3.6238646507263184, + "sampling/sampling_logp_difference/mean": 0.019945615902543068, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1349006601667497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1349006601667497e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 4881.2109375, + "completions/mean_terminated_length": 4510.1533203125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.989942155778408, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002033712575212121, + "learning_rate": 1e-05, + "loss": 0.1088, + "num_tokens": 43015238.0, + "reward": 0.4375, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000300407409668, + "sampling/importance_sampling_ratio/min": 0.0001238943514181301, + "sampling/sampling_logp_difference/max": 8.996081352233887, + "sampling/sampling_logp_difference/mean": 0.01887543685734272, + "step": 58 + }, + { + "clip_ratio/high_max": 2.584004687378183e-05, + "clip_ratio/high_mean": 6.4600117184454575e-06, + 
"clip_ratio/low_mean": 2.1371045761497953e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7831058105221018e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15001.0, + "completions/max_terminated_length": 15001.0, + "completions/mean_length": 4725.3984375, + "completions/mean_terminated_length": 4725.3984375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 1.0350637435913086, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030296226032078266, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 43637737.0, + "reward": 0.4453125, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999939203262329, + "sampling/importance_sampling_ratio/min": 0.00022932067804504186, + "sampling/sampling_logp_difference/max": 8.380389213562012, + "sampling/sampling_logp_difference/mean": 0.01995944231748581, + "step": 59 + }, + { + "clip_ratio/high_max": 1.994733975152485e-05, + "clip_ratio/high_mean": 4.986834937881213e-06, + "clip_ratio/low_mean": 3.5168303838872816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.015513832200668e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 4918.171875, + "completions/mean_terminated_length": 4736.1748046875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.965274304151535, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002758471528068185, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 44285327.0, + "reward": 0.328125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 
0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999663233757019, + "sampling/importance_sampling_ratio/min": 0.010958661325275898, + "sampling/sampling_logp_difference/max": 4.513625144958496, + "sampling/sampling_logp_difference/mean": 0.019083233550190926, + "step": 60 + }, + { + "clip_ratio/high_max": 1.0621563887980301e-05, + "clip_ratio/high_mean": 2.6553909719950752e-06, + "clip_ratio/low_mean": 3.838553107016196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1040922042157035e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15031.0, + "completions/mean_length": 4998.2890625, + "completions/mean_terminated_length": 4908.6376953125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9200445115566254, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027611786499619484, + "learning_rate": 1e-05, + "loss": 0.0575, + "num_tokens": 44944356.0, + "reward": 0.3515625, + "reward_std": 0.3895368278026581, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884366989136, + "sampling/importance_sampling_ratio/min": 0.0018651526188477874, + "sampling/sampling_logp_difference/max": 6.284412384033203, + "sampling/sampling_logp_difference/mean": 0.017853498458862305, + "step": 61 + }, + { + "clip_ratio/high_max": 1.0136624496226432e-05, + "clip_ratio/high_mean": 2.534156124056608e-06, + "clip_ratio/low_mean": 2.0260404085092887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2794560095462657e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6290.1796875, + 
"completions/mean_terminated_length": 6129.96044921875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.9360214695334435, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015557854203507304, + "learning_rate": 1e-05, + "loss": 0.0111, + "num_tokens": 45767867.0, + "reward": 0.34375, + "reward_std": 0.30168038606643677, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999427795410156, + "sampling/importance_sampling_ratio/min": 0.0011004531988874078, + "sampling/sampling_logp_difference/max": 6.812033176422119, + "sampling/sampling_logp_difference/mean": 0.0200855303555727, + "step": 62 + }, + { + "clip_ratio/high_max": 2.2559511307918e-06, + "clip_ratio/high_mean": 5.6398778269795e-07, + "clip_ratio/low_mean": 4.51761221711422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.574010984015331e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16366.0, + "completions/mean_length": 6486.15625, + "completions/mean_terminated_length": 6248.6083984375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.863138921558857, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026953541673719883, + "learning_rate": 1e-05, + "loss": -0.0194, + "num_tokens": 46618575.0, + "reward": 0.2578125, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999406337738037, + "sampling/importance_sampling_ratio/min": 0.0011708897072821856, + "sampling/sampling_logp_difference/max": 6.749991416931152, + 
"sampling/sampling_logp_difference/mean": 0.01863238587975502, + "step": 63 + }, + { + "clip_ratio/high_max": 1.0073357771034352e-05, + "clip_ratio/high_mean": 2.518339442758588e-06, + "clip_ratio/low_mean": 2.787370635815023e-05, + "clip_ratio/low_min": 3.837534222839167e-06, + "clip_ratio/region_mean": 3.0392045573535142e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 6442.7734375, + "completions/mean_terminated_length": 6284.9765625, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0242054909467697, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024442619178444147, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 47462274.0, + "reward": 0.328125, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998892545700073, + "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09, + "sampling/sampling_logp_difference/max": 19.124980926513672, + "sampling/sampling_logp_difference/mean": 0.019810764119029045, + "step": 64 + }, + { + "clip_ratio/high_max": 1.220810372615233e-05, + "clip_ratio/high_mean": 3.0520259315380827e-06, + "clip_ratio/low_mean": 4.339240456374682e-05, + "clip_ratio/low_min": 4.491233084991109e-06, + "clip_ratio/region_mean": 4.644443038159807e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 4807.765625, + "completions/mean_terminated_length": 4716.6142578125, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 1.045751042664051, + "epoch": 0.05979760809567617, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.002512057079002261, + "learning_rate": 1e-05, + "loss": 0.003, + "num_tokens": 48096692.0, + "reward": 0.3671875, + "reward_std": 0.3435155153274536, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999058842658997, + "sampling/importance_sampling_ratio/min": 1.1480136890895665e-05, + "sampling/sampling_logp_difference/max": 11.374892234802246, + "sampling/sampling_logp_difference/mean": 0.01960371434688568, + "step": 65 + }, + { + "clip_ratio/high_max": 5.37941218681226e-06, + "clip_ratio/high_mean": 1.344853046703065e-06, + "clip_ratio/low_mean": 3.0161771633174794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1506624850408116e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16264.0, + "completions/mean_length": 6703.8359375, + "completions/mean_terminated_length": 6471.51220703125, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 1.0592866837978363, + "epoch": 0.06071757129714812, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016389708034694195, + "learning_rate": 1e-05, + "loss": -0.024, + "num_tokens": 48974399.0, + "reward": 0.2734375, + "reward_std": 0.2585548758506775, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999353885650635, + "sampling/importance_sampling_ratio/min": 7.4113349910476245e-06, + "sampling/sampling_logp_difference/max": 11.8125, + "sampling/sampling_logp_difference/mean": 0.020880095660686493, + "step": 66 + }, + { + "clip_ratio/high_max": 7.093600515872822e-06, + "clip_ratio/high_mean": 1.7734001289682055e-06, + "clip_ratio/low_mean": 4.470584758564655e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.647924811251869e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16295.0, + "completions/mean_length": 6140.5078125, + "completions/mean_terminated_length": 5724.10546875, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 1.0998501181602478, + "epoch": 0.061637534498620056, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003946912474930286, + "learning_rate": 1e-05, + "loss": 0.0448, + "num_tokens": 49779920.0, + "reward": 0.34375, + "reward_std": 0.36796674132347107, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 2.849436668839189e-07, + "sampling/sampling_logp_difference/max": 15.070974349975586, + "sampling/sampling_logp_difference/mean": 0.021355850622057915, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.313956779038563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.313956779038563e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16352.0, + "completions/mean_length": 6689.8046875, + "completions/mean_terminated_length": 6213.04052734375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.8561654165387154, + "epoch": 0.062557497700092, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0021656695753335953, + "learning_rate": 1e-05, + "loss": 0.0283, + "num_tokens": 50655023.0, + "reward": 0.203125, + "reward_std": 0.21723884344100952, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999941885471344, + "sampling/importance_sampling_ratio/min": 2.836359499269747e-06, + "sampling/sampling_logp_difference/max": 12.772989273071289, + "sampling/sampling_logp_difference/mean": 0.01873670145869255, + "step": 68 + }, + { + "clip_ratio/high_max": 2.3421607693308033e-05, + "clip_ratio/high_mean": 7.242933975248889e-06, + "clip_ratio/low_mean": 3.896083626386826e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.620377103492501e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14330.0, + "completions/max_terminated_length": 14330.0, + "completions/mean_length": 5707.0078125, + "completions/mean_terminated_length": 5707.0078125, + "completions/min_length": 625.0, + "completions/min_terminated_length": 625.0, + "entropy": 1.1396166533231735, + "epoch": 0.06347746090156393, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004121148493140936, + "learning_rate": 1e-05, + "loss": 0.0397, + "num_tokens": 51406536.0, + "reward": 0.3125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999328851699829, + "sampling/importance_sampling_ratio/min": 0.0005196487763896585, + "sampling/sampling_logp_difference/max": 7.562357425689697, + "sampling/sampling_logp_difference/mean": 0.020000409334897995, + "step": 69 + }, + { + "clip_ratio/high_max": 1.82290532393381e-05, + "clip_ratio/high_mean": 4.557263309834525e-06, + "clip_ratio/low_mean": 2.5275351731579576e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9832615496161452e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + "completions/mean_length": 5655.6328125, + "completions/mean_terminated_length": 5571.1572265625, + "completions/min_length": 157.0, + 
"completions/min_terminated_length": 157.0, + "entropy": 0.8928132206201553, + "epoch": 0.06439742410303588, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032538517843931913, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 52148473.0, + "reward": 0.3984375, + "reward_std": 0.29432642459869385, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000033378601074, + "sampling/importance_sampling_ratio/min": 0.0017573959194123745, + "sampling/sampling_logp_difference/max": 6.343922138214111, + "sampling/sampling_logp_difference/mean": 0.018881790339946747, + "step": 70 + }, + { + "clip_ratio/high_max": 1.2836022506235167e-05, + "clip_ratio/high_mean": 3.209005626558792e-06, + "clip_ratio/low_mean": 3.8109637216621195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.131864307055366e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16323.0, + "completions/mean_length": 7399.7890625, + "completions/mean_terminated_length": 7034.5771484375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.8808257132768631, + "epoch": 0.06531738730450783, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002061733277514577, + "learning_rate": 1e-05, + "loss": 0.0191, + "num_tokens": 53113230.0, + "reward": 0.3046875, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999673962593079, + "sampling/importance_sampling_ratio/min": 0.005283349193632603, + "sampling/sampling_logp_difference/max": 5.243195056915283, + "sampling/sampling_logp_difference/mean": 0.018456293269991875, + "step": 71 + }, + { + "clip_ratio/high_max": 
1.5806871488166507e-05, + "clip_ratio/high_mean": 4.739466817227367e-06, + "clip_ratio/low_mean": 3.610486896832299e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.084433521711617e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16208.0, + "completions/mean_length": 5730.9609375, + "completions/mean_terminated_length": 5475.2880859375, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.9486126750707626, + "epoch": 0.06623735050597976, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012298432411625981, + "learning_rate": 1e-05, + "loss": 0.0208, + "num_tokens": 53864049.0, + "reward": 0.359375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999348521232605, + "sampling/importance_sampling_ratio/min": 4.832820559386164e-05, + "sampling/sampling_logp_difference/max": 9.937495231628418, + "sampling/sampling_logp_difference/mean": 0.01919996738433838, + "step": 72 + }, + { + "clip_ratio/high_max": 1.2390134997986024e-05, + "clip_ratio/high_mean": 3.097533749496506e-06, + "clip_ratio/low_mean": 3.8867822581778455e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.19653564449618e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13500.0, + "completions/mean_length": 4620.5703125, + "completions/mean_terminated_length": 4527.94482421875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.9557560831308365, + "epoch": 0.0671573137074517, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002882040338590741, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 54473498.0, + "reward": 0.3984375, + "reward_std": 
0.39294686913490295, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998915195465088, + "sampling/importance_sampling_ratio/min": 1.577107298089686e-07, + "sampling/sampling_logp_difference/max": 15.662503242492676, + "sampling/sampling_logp_difference/mean": 0.018525000661611557, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.088819471486204e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.088819471486204e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16314.0, + "completions/max_terminated_length": 16314.0, + "completions/mean_length": 5074.0703125, + "completions/mean_terminated_length": 5074.0703125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8830869868397713, + "epoch": 0.06807727690892364, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003324020653963089, + "learning_rate": 1e-05, + "loss": 0.0305, + "num_tokens": 55141787.0, + "reward": 0.4609375, + "reward_std": 0.30115634202957153, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999203681945801, + "sampling/importance_sampling_ratio/min": 0.0009876838885247707, + "sampling/sampling_logp_difference/max": 6.920147895812988, + "sampling/sampling_logp_difference/mean": 0.018072880804538727, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.526649884908693e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.526649884908693e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15251.0, + "completions/max_terminated_length": 15251.0, + "completions/mean_length": 6192.1015625, + 
"completions/mean_terminated_length": 6192.1015625, + "completions/min_length": 553.0, + "completions/min_terminated_length": 553.0, + "entropy": 1.0888547226786613, + "epoch": 0.06899724011039558, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017452294705435634, + "learning_rate": 1e-05, + "loss": 0.0216, + "num_tokens": 55954144.0, + "reward": 0.2890625, + "reward_std": 0.23250606656074524, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473690986633, + "sampling/importance_sampling_ratio/min": 5.061922365712235e-07, + "sampling/sampling_logp_difference/max": 14.496349334716797, + "sampling/sampling_logp_difference/mean": 0.021221645176410675, + "step": 75 + }, + { + "clip_ratio/high_max": 1.6768677141953958e-05, + "clip_ratio/high_mean": 5.080836899651331e-06, + "clip_ratio/low_mean": 3.340929970363504e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.84901372854074e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15740.0, + "completions/mean_length": 6204.296875, + "completions/mean_terminated_length": 6124.1416015625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 1.0423575639724731, + "epoch": 0.06991720331186753, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0033357341308146715, + "learning_rate": 1e-05, + "loss": 0.1073, + "num_tokens": 56765470.0, + "reward": 0.3359375, + "reward_std": 0.37875816226005554, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99998539686203, + "sampling/importance_sampling_ratio/min": 4.564182381727733e-05, + "sampling/sampling_logp_difference/max": 9.994686126708984, + 
"sampling/sampling_logp_difference/mean": 0.01908688060939312, + "step": 76 + }, + { + "clip_ratio/high_max": 3.149884150843718e-06, + "clip_ratio/high_mean": 7.874710377109295e-07, + "clip_ratio/low_mean": 2.430614893000893e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.509361991087644e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14409.0, + "completions/max_terminated_length": 14409.0, + "completions/mean_length": 5070.3125, + "completions/mean_terminated_length": 5070.3125, + "completions/min_length": 629.0, + "completions/min_terminated_length": 629.0, + "entropy": 1.0737399458885193, + "epoch": 0.07083716651333946, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0038695367984473705, + "learning_rate": 1e-05, + "loss": 0.0015, + "num_tokens": 57432958.0, + "reward": 0.390625, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999223947525024, + "sampling/importance_sampling_ratio/min": 1.5509348259001854e-06, + "sampling/sampling_logp_difference/max": 13.376652717590332, + "sampling/sampling_logp_difference/mean": 0.01970684342086315, + "step": 77 + }, + { + "clip_ratio/high_max": 1.9821940441033803e-05, + "clip_ratio/high_mean": 4.955485110258451e-06, + "clip_ratio/low_mean": 2.9055729555693688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.401121466595214e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15799.0, + "completions/mean_length": 5750.21875, + "completions/mean_terminated_length": 5495.00830078125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.9708107560873032, + "epoch": 0.07175712971481141, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002927646040916443, + "learning_rate": 1e-05, + 
"loss": 0.0166, + "num_tokens": 58187426.0, + "reward": 0.296875, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999390840530396, + "sampling/importance_sampling_ratio/min": 0.015204614959657192, + "sampling/sampling_logp_difference/max": 4.186156272888184, + "sampling/sampling_logp_difference/mean": 0.019483914598822594, + "step": 78 + }, + { + "clip_ratio/high_max": 2.3815636723156786e-05, + "clip_ratio/high_mean": 5.953909180789196e-06, + "clip_ratio/low_mean": 4.989707144886779e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.585097960647545e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15938.0, + "completions/mean_length": 6067.484375, + "completions/mean_terminated_length": 5986.251953125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.9576351121068001, + "epoch": 0.07267709291628335, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0026169484481215477, + "learning_rate": 1e-05, + "loss": -0.0055, + "num_tokens": 58983336.0, + "reward": 0.390625, + "reward_std": 0.3406373858451843, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620914459229, + "sampling/importance_sampling_ratio/min": 1.974713995878119e-06, + "sampling/sampling_logp_difference/max": 13.135087013244629, + "sampling/sampling_logp_difference/mean": 0.019007554277777672, + "step": 79 + }, + { + "clip_ratio/high_max": 2.4238934656750644e-05, + "clip_ratio/high_mean": 7.786730066072778e-06, + "clip_ratio/low_mean": 4.5700241571466904e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.3486972547034384e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 13640.0, + "completions/max_terminated_length": 13640.0, + "completions/mean_length": 4612.8984375, + "completions/mean_terminated_length": 4612.8984375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.9636320173740387, + "epoch": 0.07359705611775529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0015429699560627341, + "learning_rate": 1e-05, + "loss": -0.018, + "num_tokens": 59590763.0, + "reward": 0.421875, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999473094940186, + "sampling/importance_sampling_ratio/min": 2.5909587364481013e-08, + "sampling/sampling_logp_difference/max": 17.468652725219727, + "sampling/sampling_logp_difference/mean": 0.019313856959342957, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0911465842109465e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0911465842109465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 6101.3125, + "completions/mean_terminated_length": 5854.5283203125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.8831139355897903, + "epoch": 0.07451701931922723, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022505265660583973, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 60391283.0, + "reward": 0.3125, + "reward_std": 0.29302334785461426, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + 
"sampling/importance_sampling_ratio/min": 0.0003816343960352242, + "sampling/sampling_logp_difference/max": 7.871047496795654, + "sampling/sampling_logp_difference/mean": 0.018377842381596565, + "step": 81 + }, + { + "clip_ratio/high_max": 1.547606643725885e-05, + "clip_ratio/high_mean": 3.869016609314713e-06, + "clip_ratio/low_mean": 2.478705800967873e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8656074391619768e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14862.0, + "completions/mean_length": 4705.9921875, + "completions/mean_terminated_length": 4614.03955078125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.9557913094758987, + "epoch": 0.07543698252069918, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002069958718493581, + "learning_rate": 1e-05, + "loss": -0.0015, + "num_tokens": 61021490.0, + "reward": 0.4296875, + "reward_std": 0.2637920379638672, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030232429504, + "sampling/importance_sampling_ratio/min": 2.76673017651774e-05, + "sampling/sampling_logp_difference/max": 10.495259284973145, + "sampling/sampling_logp_difference/mean": 0.018629569560289383, + "step": 82 + }, + { + "clip_ratio/high_max": 2.0910484636260662e-05, + "clip_ratio/high_mean": 5.2276211590651656e-06, + "clip_ratio/low_mean": 1.952954164607945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4757162805144617e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13745.0, + "completions/max_terminated_length": 13745.0, + "completions/mean_length": 5116.78125, + "completions/mean_terminated_length": 5116.78125, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 1.0198405236005783, 
+ "epoch": 0.07635694572217111, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034461067989468575, + "learning_rate": 1e-05, + "loss": -0.0073, + "num_tokens": 61695382.0, + "reward": 0.265625, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999936819076538, + "sampling/importance_sampling_ratio/min": 0.012227212078869343, + "sampling/sampling_logp_difference/max": 4.4040913581848145, + "sampling/sampling_logp_difference/mean": 0.019400250166654587, + "step": 83 + }, + { + "clip_ratio/high_max": 1.5340228401328204e-05, + "clip_ratio/high_mean": 3.835057100332051e-06, + "clip_ratio/low_mean": 3.150914017169271e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.534419727202476e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 5891.9140625, + "completions/mean_terminated_length": 5553.45947265625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.9568078517913818, + "epoch": 0.07727690892364306, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025854657869786024, + "learning_rate": 1e-05, + "loss": 0.1013, + "num_tokens": 62474883.0, + "reward": 0.3203125, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001013278961182, + "sampling/importance_sampling_ratio/min": 0.0015072470996528864, + "sampling/sampling_logp_difference/max": 6.497470378875732, + "sampling/sampling_logp_difference/mean": 0.019574139267206192, + "step": 84 + }, + { + "clip_ratio/high_max": 1.108303422370227e-05, + "clip_ratio/high_mean": 2.7707585559255676e-06, + 
"clip_ratio/low_mean": 2.2325777763398946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5096536319324514e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13671.0, + "completions/mean_length": 5300.3359375, + "completions/mean_terminated_length": 5213.06298828125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.9722280204296112, + "epoch": 0.078196872125115, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0025075653102248907, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 63172454.0, + "reward": 0.203125, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 0.00020346972451079637, + "sampling/sampling_logp_difference/max": 8.499993324279785, + "sampling/sampling_logp_difference/mean": 0.02002432942390442, + "step": 85 + }, + { + "clip_ratio/high_max": 1.3991947980684927e-05, + "clip_ratio/high_mean": 3.4979869951712317e-06, + "clip_ratio/low_mean": 4.893367201930232e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.243165958290774e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15617.0, + "completions/mean_length": 6364.21875, + "completions/mean_terminated_length": 6205.1748046875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 1.0607495978474617, + "epoch": 0.07911683532658693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017982006538659334, + "learning_rate": 1e-05, + "loss": -0.0117, + "num_tokens": 64007602.0, + "reward": 0.2890625, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.2890625, + 
"rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 3.823801307589747e-05, + "sampling/sampling_logp_difference/max": 10.171680450439453, + "sampling/sampling_logp_difference/mean": 0.020373597741127014, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6416430046083406e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6416430046083406e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14709.0, + "completions/mean_length": 5746.3125, + "completions/mean_terminated_length": 5403.1611328125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "entropy": 0.9913106113672256, + "epoch": 0.08003679852805888, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002207317156717181, + "learning_rate": 1e-05, + "loss": 0.063, + "num_tokens": 64762058.0, + "reward": 0.34375, + "reward_std": 0.3264310359954834, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999239444732666, + "sampling/importance_sampling_ratio/min": 5.3444750847120304e-08, + "sampling/sampling_logp_difference/max": 16.744617462158203, + "sampling/sampling_logp_difference/mean": 0.020608089864253998, + "step": 87 + }, + { + "clip_ratio/high_max": 1.2681661701208213e-05, + "clip_ratio/high_mean": 3.1704154253020533e-06, + "clip_ratio/low_mean": 3.541917828897567e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.85895939416514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 6088.5625, + 
"completions/mean_terminated_length": 5841.47216796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.9040444120764732, + "epoch": 0.08095676172953081, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0012974507408216596, + "learning_rate": 1e-05, + "loss": 0.0401, + "num_tokens": 65561002.0, + "reward": 0.3671875, + "reward_std": 0.2477683573961258, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998487234115601, + "sampling/importance_sampling_ratio/min": 6.021501121722395e-06, + "sampling/sampling_logp_difference/max": 12.020174026489258, + "sampling/sampling_logp_difference/mean": 0.01939838007092476, + "step": 88 + }, + { + "clip_ratio/high_max": 7.807132533343975e-06, + "clip_ratio/high_mean": 1.9517831333359936e-06, + "clip_ratio/low_mean": 1.8564539345788944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.05163223654381e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15021.0, + "completions/mean_length": 5765.5, + "completions/mean_terminated_length": 5510.65625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.9966336265206337, + "epoch": 0.08187672493100276, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0013380619930103421, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 66318482.0, + "reward": 0.375, + "reward_std": 0.13994136452674866, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999471306800842, + "sampling/importance_sampling_ratio/min": 7.288413598871557e-06, + "sampling/sampling_logp_difference/max": 11.829224586486816, + 
"sampling/sampling_logp_difference/mean": 0.018109245225787163, + "step": 89 + }, + { + "clip_ratio/high_max": 1.7906912489706883e-05, + "clip_ratio/high_mean": 4.476728122426721e-06, + "clip_ratio/low_mean": 2.5812531305291486e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0289259655091882e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16120.0, + "completions/mean_length": 5462.78125, + "completions/mean_terminated_length": 5200.67236328125, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "entropy": 0.9345141425728798, + "epoch": 0.0827966881324747, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023930128663778305, + "learning_rate": 1e-05, + "loss": 0.0475, + "num_tokens": 67038582.0, + "reward": 0.46875, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999513030052185, + "sampling/importance_sampling_ratio/min": 0.008508839644491673, + "sampling/sampling_logp_difference/max": 4.7666497230529785, + "sampling/sampling_logp_difference/mean": 0.019220296293497086, + "step": 90 + }, + { + "clip_ratio/high_max": 1.551389118503721e-05, + "clip_ratio/high_mean": 3.878472796259302e-06, + "clip_ratio/low_mean": 3.239646628117043e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6274939645863924e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15034.0, + "completions/max_terminated_length": 15034.0, + "completions/mean_length": 5547.5078125, + "completions/mean_terminated_length": 5547.5078125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0511749312281609, + "epoch": 0.08371665133394664, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0013633714988827705, + "learning_rate": 1e-05, 
+ "loss": 0.0462, + "num_tokens": 67774487.0, + "reward": 0.203125, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 1.0995515367540065e-05, + "sampling/sampling_logp_difference/max": 11.418023109436035, + "sampling/sampling_logp_difference/mean": 0.020328814163804054, + "step": 91 + }, + { + "clip_ratio/high_max": 1.5384989410449634e-05, + "clip_ratio/high_mean": 3.846247352612409e-06, + "clip_ratio/low_mean": 3.441604167164769e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.826228908110352e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5835.4140625, + "completions/mean_terminated_length": 5406.609375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 1.0024723336100578, + "epoch": 0.08463661453541858, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0036165034398436546, + "learning_rate": 1e-05, + "loss": 0.0373, + "num_tokens": 68541660.0, + "reward": 0.34375, + "reward_std": 0.3584783673286438, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999669790267944, + "sampling/importance_sampling_ratio/min": 9.518130354990717e-06, + "sampling/sampling_logp_difference/max": 11.562312126159668, + "sampling/sampling_logp_difference/mean": 0.020469525828957558, + "step": 92 + }, + { + "clip_ratio/high_max": 6.105602551542688e-06, + "clip_ratio/high_mean": 1.526400637885672e-06, + "clip_ratio/low_mean": 5.3129634352444555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.46560352177039e-05, + 
"completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15695.0, + "completions/mean_length": 6252.609375, + "completions/mean_terminated_length": 6172.83447265625, + "completions/min_length": 481.0, + "completions/min_terminated_length": 481.0, + "entropy": 1.0325519517064095, + "epoch": 0.08555657773689053, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022011541295796633, + "learning_rate": 1e-05, + "loss": 0.036, + "num_tokens": 69365418.0, + "reward": 0.3828125, + "reward_std": 0.32301604747772217, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998809099197388, + "sampling/importance_sampling_ratio/min": 0.0005531083443202078, + "sampling/sampling_logp_difference/max": 7.4999566078186035, + "sampling/sampling_logp_difference/mean": 0.02079072594642639, + "step": 93 + }, + { + "clip_ratio/high_max": 4.348128641140647e-06, + "clip_ratio/high_mean": 1.0870321602851618e-06, + "clip_ratio/low_mean": 3.0097819148977578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.118485085451539e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15316.0, + "completions/max_terminated_length": 15316.0, + "completions/mean_length": 5581.484375, + "completions/mean_terminated_length": 5581.484375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.9222500994801521, + "epoch": 0.08647654093836246, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002300912281498313, + "learning_rate": 1e-05, + "loss": -0.0007, + "num_tokens": 70099320.0, + "reward": 0.296875, + "reward_std": 0.2959064245223999, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9998577833175659, + "sampling/importance_sampling_ratio/min": 8.140386853483506e-08, + "sampling/sampling_logp_difference/max": 16.323843002319336, + "sampling/sampling_logp_difference/mean": 0.01952272653579712, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5122252029395895e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5122252029395895e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15781.0, + "completions/max_terminated_length": 15781.0, + "completions/mean_length": 5424.140625, + "completions/mean_terminated_length": 5424.140625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 1.0446564108133316, + "epoch": 0.08739650413983441, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016312639927491546, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 70811474.0, + "reward": 0.359375, + "reward_std": 0.31246691942214966, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000094175338745, + "sampling/importance_sampling_ratio/min": 0.0021919538266956806, + "sampling/sampling_logp_difference/max": 6.12296199798584, + "sampling/sampling_logp_difference/mean": 0.019741754978895187, + "step": 95 + }, + { + "clip_ratio/high_max": 1.0354576261306647e-05, + "clip_ratio/high_mean": 3.496124691082514e-06, + "clip_ratio/low_mean": 4.096481598026003e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.446094089871622e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15755.0, + "completions/max_terminated_length": 15755.0, + "completions/mean_length": 5884.9609375, + "completions/mean_terminated_length": 5884.9609375, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.9605691060423851, + "epoch": 
0.08831646734130635, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032865386456251144, + "learning_rate": 1e-05, + "loss": 0.0451, + "num_tokens": 71582701.0, + "reward": 0.4140625, + "reward_std": 0.3514111638069153, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999833106994629, + "sampling/importance_sampling_ratio/min": 1.149311810877407e-05, + "sampling/sampling_logp_difference/max": 11.373762130737305, + "sampling/sampling_logp_difference/mean": 0.019438734278082848, + "step": 96 + }, + { + "clip_ratio/high_max": 1.026998006636859e-05, + "clip_ratio/high_mean": 2.5674950165921473e-06, + "clip_ratio/low_mean": 3.5440503552308655e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8007998455213965e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15361.0, + "completions/max_terminated_length": 15361.0, + "completions/mean_length": 4835.09375, + "completions/mean_terminated_length": 4835.09375, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "entropy": 0.9038172215223312, + "epoch": 0.08923643054277829, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004721678793430328, + "learning_rate": 1e-05, + "loss": 0.1143, + "num_tokens": 72220025.0, + "reward": 0.4765625, + "reward_std": 0.38481879234313965, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99994957447052, + "sampling/importance_sampling_ratio/min": 2.710051205667696e-07, + "sampling/sampling_logp_difference/max": 15.12112808227539, + "sampling/sampling_logp_difference/mean": 0.017888439819216728, + "step": 97 + }, + { + "clip_ratio/high_max": 2.93432283342554e-05, + "clip_ratio/high_mean": 9.56252398509605e-06, + "clip_ratio/low_mean": 
4.7865792453194445e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.742831808674964e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14431.0, + "completions/mean_length": 5979.078125, + "completions/mean_terminated_length": 5897.1494140625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 1.0227951630949974, + "epoch": 0.09015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0010532280430197716, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 73005515.0, + "reward": 0.2890625, + "reward_std": 0.30115631222724915, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999090433120728, + "sampling/importance_sampling_ratio/min": 0.00030157779110595584, + "sampling/sampling_logp_difference/max": 8.10648250579834, + "sampling/sampling_logp_difference/mean": 0.019633149728178978, + "step": 98 + }, + { + "clip_ratio/high_max": 4.203234766464448e-06, + "clip_ratio/high_mean": 1.050808691616112e-06, + "clip_ratio/low_mean": 2.5574990331733716e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6625799137036665e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15886.0, + "completions/max_terminated_length": 15886.0, + "completions/mean_length": 4292.1796875, + "completions/mean_terminated_length": 4292.1796875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.8719984591007233, + "epoch": 0.09107635694572216, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0038324075285345316, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 73572794.0, + "reward": 0.4375, + "reward_std": 0.2972046136856079, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999188780784607, + "sampling/importance_sampling_ratio/min": 0.015675775706768036, + "sampling/sampling_logp_difference/max": 4.155638694763184, + "sampling/sampling_logp_difference/mean": 0.018074234947562218, + "step": 99 + }, + { + "clip_ratio/high_max": 4.431366960488958e-06, + "clip_ratio/high_mean": 1.1078417401222396e-06, + "clip_ratio/low_mean": 4.433405501913512e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.54418968729442e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14674.0, + "completions/max_terminated_length": 14674.0, + "completions/mean_length": 5449.2890625, + "completions/mean_terminated_length": 5449.2890625, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9137986451387405, + "epoch": 0.09199632014719411, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004843447357416153, + "learning_rate": 1e-05, + "loss": 0.0166, + "num_tokens": 74289607.0, + "reward": 0.5, + "reward_std": 0.40609243512153625, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999977707862854, + "sampling/importance_sampling_ratio/min": 8.851584993863071e-07, + "sampling/sampling_logp_difference/max": 13.937499046325684, + "sampling/sampling_logp_difference/mean": 0.018183842301368713, + "step": 100 + }, + { + "clip_ratio/high_max": 8.212076863856055e-06, + "clip_ratio/high_mean": 2.0530192159640137e-06, + "clip_ratio/low_mean": 3.6279372466196946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.833239122741361e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16163.0, + "completions/max_terminated_length": 16163.0, + "completions/mean_length": 4983.3515625, + "completions/mean_terminated_length": 4983.3515625, + 
"completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "entropy": 0.9354705810546875, + "epoch": 0.09291628334866606, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037651765160262585, + "learning_rate": 1e-05, + "loss": 0.0463, + "num_tokens": 74946484.0, + "reward": 0.3671875, + "reward_std": 0.3090519309043884, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 0.00011593531962716952, + "sampling/sampling_logp_difference/max": 9.062478065490723, + "sampling/sampling_logp_difference/mean": 0.018207306042313576, + "step": 101 + }, + { + "clip_ratio/high_max": 1.3182888324081432e-05, + "clip_ratio/high_mean": 3.295722081020358e-06, + "clip_ratio/low_mean": 2.544108633628639e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8736808644680423e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16039.0, + "completions/mean_length": 6351.1015625, + "completions/mean_terminated_length": 6027.45947265625, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "entropy": 0.9310042560100555, + "epoch": 0.09383624655013799, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0009160125628113747, + "learning_rate": 1e-05, + "loss": -0.023, + "num_tokens": 75779145.0, + "reward": 0.3828125, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998877048492432, + "sampling/importance_sampling_ratio/min": 0.0002961359277833253, + "sampling/sampling_logp_difference/max": 8.1246919631958, + "sampling/sampling_logp_difference/mean": 0.018513178452849388, + "step": 102 
+ }, + { + "clip_ratio/high_max": 1.1402620202716207e-05, + "clip_ratio/high_mean": 3.935649147024378e-06, + "clip_ratio/low_mean": 3.059757568735222e-05, + "clip_ratio/low_min": 4.3258582991256844e-06, + "clip_ratio/region_mean": 3.45332257438713e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14471.0, + "completions/mean_length": 5293.40625, + "completions/mean_terminated_length": 4935.64501953125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 1.0732879787683487, + "epoch": 0.09475620975160993, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023993055801838636, + "learning_rate": 1e-05, + "loss": 0.1021, + "num_tokens": 76475557.0, + "reward": 0.34375, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000077724456787, + "sampling/importance_sampling_ratio/min": 6.613240111619234e-05, + "sampling/sampling_logp_difference/max": 9.623851776123047, + "sampling/sampling_logp_difference/mean": 0.020792219787836075, + "step": 103 + }, + { + "clip_ratio/high_max": 2.130644793396641e-05, + "clip_ratio/high_mean": 8.929533635182452e-06, + "clip_ratio/low_mean": 2.663600798769039e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.556554071337814e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16305.0, + "completions/mean_length": 7619.7578125, + "completions/mean_terminated_length": 7409.41650390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9646238535642624, + "epoch": 0.09567617295308188, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0014872358879074454, + "learning_rate": 1e-05, + "loss": 0.0439, + "num_tokens": 77474310.0, + 
"reward": 0.34375, + "reward_std": 0.33114904165267944, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999638795852661, + "sampling/importance_sampling_ratio/min": 0.0016686831368133426, + "sampling/sampling_logp_difference/max": 6.395720481872559, + "sampling/sampling_logp_difference/mean": 0.020074717700481415, + "step": 104 + }, + { + "clip_ratio/high_max": 1.7765815300663235e-05, + "clip_ratio/high_mean": 5.154013138053415e-06, + "clip_ratio/low_mean": 5.166909659237717e-05, + "clip_ratio/low_min": 8.365680514543783e-06, + "clip_ratio/region_mean": 5.68231100714911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15984.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 5959.921875, + "completions/mean_terminated_length": 5959.921875, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 1.004471093416214, + "epoch": 0.09659613615455381, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00398358516395092, + "learning_rate": 1e-05, + "loss": 0.1016, + "num_tokens": 78257132.0, + "reward": 0.359375, + "reward_std": 0.3653082847595215, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000170469284058, + "sampling/importance_sampling_ratio/min": 0.0030075267422944307, + "sampling/sampling_logp_difference/max": 5.806637287139893, + "sampling/sampling_logp_difference/mean": 0.020755283534526825, + "step": 105 + }, + { + "clip_ratio/high_max": 1.6946955838648137e-05, + "clip_ratio/high_mean": 4.236738959662034e-06, + "clip_ratio/low_mean": 4.510891039899434e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.934564867653535e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 13736.0, + "completions/mean_length": 5427.03125, + "completions/mean_terminated_length": 5340.755859375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9117375314235687, + "epoch": 0.09751609935602576, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019883522763848305, + "learning_rate": 1e-05, + "loss": 0.01, + "num_tokens": 78971072.0, + "reward": 0.375, + "reward_std": 0.31694266200065613, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000550746917725, + "sampling/importance_sampling_ratio/min": 0.0008046010043472052, + "sampling/sampling_logp_difference/max": 7.125164031982422, + "sampling/sampling_logp_difference/mean": 0.018812140449881554, + "step": 106 + }, + { + "clip_ratio/high_max": 2.968176841022796e-05, + "clip_ratio/high_mean": 7.42044210255699e-06, + "clip_ratio/low_mean": 3.220799408154562e-05, + "clip_ratio/low_min": 5.315981979947537e-06, + "clip_ratio/region_mean": 3.962843629778945e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16293.0, + "completions/max_terminated_length": 16293.0, + "completions/mean_length": 6062.078125, + "completions/mean_terminated_length": 6062.078125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 1.0164100378751755, + "epoch": 0.0984360625574977, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00450351694598794, + "learning_rate": 1e-05, + "loss": 0.0426, + "num_tokens": 79764434.0, + "reward": 0.2578125, + "reward_std": 0.26355957984924316, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999713897705078, + 
"sampling/importance_sampling_ratio/min": 0.0007411236292682588, + "sampling/sampling_logp_difference/max": 7.207343101501465, + "sampling/sampling_logp_difference/mean": 0.020526543259620667, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.856050622947805e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.856050622947805e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13689.0, + "completions/max_terminated_length": 13689.0, + "completions/mean_length": 4856.53125, + "completions/mean_terminated_length": 4856.53125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 1.0780886858701706, + "epoch": 0.09935602575896964, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0033157530706375837, + "learning_rate": 1e-05, + "loss": 0.046, + "num_tokens": 80405238.0, + "reward": 0.3359375, + "reward_std": 0.3487703502178192, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999889135360718, + "sampling/importance_sampling_ratio/min": 0.033773623406887054, + "sampling/sampling_logp_difference/max": 3.7256407737731934, + "sampling/sampling_logp_difference/mean": 0.019188418984413147, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.975351790406421e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.975351790406421e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16335.0, + "completions/max_terminated_length": 16335.0, + "completions/mean_length": 3930.5859375, + "completions/mean_terminated_length": 3930.5859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8666863515973091, + "epoch": 0.10027598896044158, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.005471619311720133, + "learning_rate": 1e-05, + "loss": -0.0779, + "num_tokens": 80926721.0, + "reward": 0.5859375, + "reward_std": 0.3164186179637909, + "rewards/accuracy_reward/mean": 0.5859375, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000040531158447, + "sampling/importance_sampling_ratio/min": 0.0002562212466727942, + "sampling/sampling_logp_difference/max": 8.269469261169434, + "sampling/sampling_logp_difference/mean": 0.017708823084831238, + "step": 109 + }, + { + "clip_ratio/high_max": 6.743997801095247e-06, + "clip_ratio/high_mean": 1.6859994502738118e-06, + "clip_ratio/low_mean": 3.61007656692891e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.7786765119562915e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15546.0, + "completions/mean_length": 5934.9453125, + "completions/mean_terminated_length": 5684.16845703125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.9991667941212654, + "epoch": 0.10119595216191353, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002580739092081785, + "learning_rate": 1e-05, + "loss": -0.0065, + "num_tokens": 81707978.0, + "reward": 0.3046875, + "reward_std": 0.24671243131160736, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000852346420288, + "sampling/importance_sampling_ratio/min": 0.002478762762621045, + "sampling/sampling_logp_difference/max": 5.999995708465576, + "sampling/sampling_logp_difference/mean": 0.019801246002316475, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.43532002741631e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
2.43532002741631e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 5866.84375, + "completions/mean_terminated_length": 5699.9052734375, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.9848997294902802, + "epoch": 0.10211591536338546, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0010949905263260007, + "learning_rate": 1e-05, + "loss": 0.0266, + "num_tokens": 82477310.0, + "reward": 0.2734375, + "reward_std": 0.26933354139328003, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999667406082153, + "sampling/importance_sampling_ratio/min": 9.04304688447155e-05, + "sampling/sampling_logp_difference/max": 9.310929298400879, + "sampling/sampling_logp_difference/mean": 0.020769795402884483, + "step": 111 + }, + { + "clip_ratio/high_max": 1.9307613456476247e-05, + "clip_ratio/high_mean": 4.826903364119062e-06, + "clip_ratio/low_mean": 5.842190330440644e-05, + "clip_ratio/low_min": 1.2287753634154797e-05, + "clip_ratio/region_mean": 6.324880496322294e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14501.0, + "completions/max_terminated_length": 14501.0, + "completions/mean_length": 6613.7578125, + "completions/mean_terminated_length": 6613.7578125, + "completions/min_length": 1033.0, + "completions/min_terminated_length": 1033.0, + "entropy": 0.9176012054085732, + "epoch": 0.10303587856485741, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020384234376251698, + "learning_rate": 1e-05, + "loss": 0.0571, + "num_tokens": 83345055.0, + "reward": 0.3671875, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999457001686096, + "sampling/importance_sampling_ratio/min": 0.029541675001382828, + "sampling/sampling_logp_difference/max": 3.5219533443450928, + "sampling/sampling_logp_difference/mean": 0.018883168697357178, + "step": 112 + }, + { + "clip_ratio/high_max": 1.382043183184578e-05, + "clip_ratio/high_mean": 3.455107957961445e-06, + "clip_ratio/low_mean": 5.789885449303256e-05, + "clip_ratio/low_min": 1.017130716718384e-05, + "clip_ratio/region_mean": 6.135396188255982e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16310.0, + "completions/mean_length": 6392.3125, + "completions/mean_terminated_length": 6070.0, + "completions/min_length": 507.0, + "completions/min_terminated_length": 507.0, + "entropy": 0.904954232275486, + "epoch": 0.10395584176632934, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0031166900880634785, + "learning_rate": 1e-05, + "loss": 0.0351, + "num_tokens": 84186343.0, + "reward": 0.390625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999208450317383, + "sampling/importance_sampling_ratio/min": 0.00022529886336997151, + "sampling/sampling_logp_difference/max": 8.398082733154297, + "sampling/sampling_logp_difference/mean": 0.01931958645582199, + "step": 113 + }, + { + "clip_ratio/high_max": 1.7221671441802755e-05, + "clip_ratio/high_mean": 6.549099907715572e-06, + "clip_ratio/low_mean": 3.147818074467068e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.802728065238625e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16180.0, + "completions/mean_length": 5982.703125, + "completions/mean_terminated_length": 5817.603515625, + "completions/min_length": 294.0, + 
"completions/min_terminated_length": 294.0, + "entropy": 0.8394555225968361, + "epoch": 0.10487580496780129, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022041688207536936, + "learning_rate": 1e-05, + "loss": 0.1043, + "num_tokens": 84971129.0, + "reward": 0.3125, + "reward_std": 0.30774885416030884, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999030828475952, + "sampling/importance_sampling_ratio/min": 1.553593506287143e-06, + "sampling/sampling_logp_difference/max": 13.374939918518066, + "sampling/sampling_logp_difference/mean": 0.01795877143740654, + "step": 114 + }, + { + "clip_ratio/high_max": 2.9651660042873118e-05, + "clip_ratio/high_mean": 9.398806923854863e-06, + "clip_ratio/low_mean": 4.788733849636628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.728614519284747e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14988.0, + "completions/mean_length": 4976.921875, + "completions/mean_terminated_length": 4608.95166015625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.8381234556436539, + "epoch": 0.10579576816927323, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0037972736172378063, + "learning_rate": 1e-05, + "loss": 0.1244, + "num_tokens": 85625559.0, + "reward": 0.4765625, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970555305481, + "sampling/importance_sampling_ratio/min": 0.002990707289427519, + "sampling/sampling_logp_difference/max": 5.8122453689575195, + "sampling/sampling_logp_difference/mean": 0.01815030723810196, + "step": 115 + }, + { + "clip_ratio/high_max": 
4.130592969886493e-06, + "clip_ratio/high_mean": 1.0326482424716232e-06, + "clip_ratio/low_mean": 1.6904315600640984e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7936963843112608e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15984.0, + "completions/mean_length": 6307.2421875, + "completions/mean_terminated_length": 6065.400390625, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 1.1176434755325317, + "epoch": 0.10671573137074516, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0012413962977007031, + "learning_rate": 1e-05, + "loss": 0.0146, + "num_tokens": 86453606.0, + "reward": 0.28125, + "reward_std": 0.2280253767967224, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 0.004730688873678446, + "sampling/sampling_logp_difference/max": 5.353684425354004, + "sampling/sampling_logp_difference/mean": 0.021790307015180588, + "step": 116 + }, + { + "clip_ratio/high_max": 1.3160772823539446e-05, + "clip_ratio/high_mean": 3.2901932058848615e-06, + "clip_ratio/low_mean": 3.582628983167524e-05, + "clip_ratio/low_min": 2.61966624748311e-06, + "clip_ratio/region_mean": 3.911648195753514e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16329.0, + "completions/mean_length": 7263.1640625, + "completions/mean_terminated_length": 7044.26416015625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 1.107876107096672, + "epoch": 0.10763569457221711, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017762042116373777, + "learning_rate": 1e-05, + "loss": 0.0349, + "num_tokens": 87402763.0, + "reward": 0.2578125, + "reward_std": 
0.27776598930358887, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741315841675, + "sampling/importance_sampling_ratio/min": 0.0009408573969267309, + "sampling/sampling_logp_difference/max": 6.968719005584717, + "sampling/sampling_logp_difference/mean": 0.02103034406900406, + "step": 117 + }, + { + "clip_ratio/high_max": 3.987745776612428e-05, + "clip_ratio/high_mean": 1.1877163728968299e-05, + "clip_ratio/low_mean": 4.26799579145154e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.455712096136267e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15416.0, + "completions/mean_length": 5093.859375, + "completions/mean_terminated_length": 4914.65087890625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 1.1065888702869415, + "epoch": 0.10855565777368906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032127038575708866, + "learning_rate": 1e-05, + "loss": 0.0194, + "num_tokens": 88077385.0, + "reward": 0.421875, + "reward_std": 0.345874547958374, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999171495437622, + "sampling/importance_sampling_ratio/min": 7.033879228401929e-05, + "sampling/sampling_logp_difference/max": 9.562187194824219, + "sampling/sampling_logp_difference/mean": 0.020314980298280716, + "step": 118 + }, + { + "clip_ratio/high_max": 9.35208754526684e-06, + "clip_ratio/high_mean": 4.4788730519940145e-06, + "clip_ratio/low_mean": 3.470697703278347e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.918584917528278e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15740.0, + "completions/mean_length": 6943.53125, + "completions/mean_terminated_length": 6639.0, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.9009081721305847, + "epoch": 0.10947562097516099, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028925195802003145, + "learning_rate": 1e-05, + "loss": 0.0862, + "num_tokens": 88985269.0, + "reward": 0.3984375, + "reward_std": 0.3535328209400177, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 6.553035092338177e-08, + "sampling/sampling_logp_difference/max": 16.540752410888672, + "sampling/sampling_logp_difference/mean": 0.019378282129764557, + "step": 119 + }, + { + "clip_ratio/high_max": 1.0939961612166371e-05, + "clip_ratio/high_mean": 2.734990403041593e-06, + "clip_ratio/low_mean": 2.4615862798782473e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7350853201824066e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15148.0, + "completions/max_terminated_length": 15148.0, + "completions/mean_length": 4976.25, + "completions/mean_terminated_length": 4976.25, + "completions/min_length": 702.0, + "completions/min_terminated_length": 702.0, + "entropy": 0.9463540017604828, + "epoch": 0.11039558417663294, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017386430408805609, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 89645205.0, + "reward": 0.359375, + "reward_std": 0.26462042331695557, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999554753303528, + "sampling/importance_sampling_ratio/min": 7.889595508459024e-06, + 
"sampling/sampling_logp_difference/max": 11.74996566772461, + "sampling/sampling_logp_difference/mean": 0.018035830929875374, + "step": 120 + }, + { + "clip_ratio/high_max": 5.941629297012696e-06, + "clip_ratio/high_mean": 1.485407324253174e-06, + "clip_ratio/low_mean": 2.6826061798601586e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8311469009167922e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15892.0, + "completions/mean_length": 6439.5390625, + "completions/mean_terminated_length": 6281.69091796875, + "completions/min_length": 959.0, + "completions/min_terminated_length": 959.0, + "entropy": 0.899876207113266, + "epoch": 0.11131554737810488, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0037381781730800867, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 90489394.0, + "reward": 0.3203125, + "reward_std": 0.2624938488006592, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999206066131592, + "sampling/importance_sampling_ratio/min": 0.003606764366850257, + "sampling/sampling_logp_difference/max": 5.62494421005249, + "sampling/sampling_logp_difference/mean": 0.019368179142475128, + "step": 121 + }, + { + "clip_ratio/high_max": 5.189952389628161e-06, + "clip_ratio/high_mean": 1.2974880974070402e-06, + "clip_ratio/low_mean": 3.058137212974543e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.187886022715247e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15979.0, + "completions/mean_length": 6876.46875, + "completions/mean_terminated_length": 6408.884765625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.1018569767475128, + "epoch": 0.11223551057957681, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0018562980694696307, + "learning_rate": 1e-05, + "loss": 0.095, + "num_tokens": 91390054.0, + "reward": 0.21875, + "reward_std": 0.29955869913101196, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999849796295166, + "sampling/importance_sampling_ratio/min": 2.9343695132411085e-05, + "sampling/sampling_logp_difference/max": 10.436432838439941, + "sampling/sampling_logp_difference/mean": 0.020825792104005814, + "step": 122 + }, + { + "clip_ratio/high_max": 2.022083435804234e-05, + "clip_ratio/high_mean": 5.055208589510585e-06, + "clip_ratio/low_mean": 3.029032552603894e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.53455343429232e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14153.0, + "completions/mean_length": 6501.5078125, + "completions/mean_terminated_length": 6344.64306640625, + "completions/min_length": 720.0, + "completions/min_terminated_length": 720.0, + "entropy": 1.073579266667366, + "epoch": 0.11315547378104876, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016695430967956781, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 92241535.0, + "reward": 0.2734375, + "reward_std": 0.28641316294670105, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998984336853027, + "sampling/importance_sampling_ratio/min": 0.0002380236255703494, + "sampling/sampling_logp_difference/max": 8.343140602111816, + "sampling/sampling_logp_difference/mean": 0.020438479259610176, + "step": 123 + }, + { + "clip_ratio/high_max": 3.3911180707946187e-06, + "clip_ratio/high_mean": 8.477795176986547e-07, + "clip_ratio/low_mean": 2.2190370486896427e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.30381500614385e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14345.0, + "completions/max_terminated_length": 14345.0, + "completions/mean_length": 5474.1328125, + "completions/mean_terminated_length": 5474.1328125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 1.0692576617002487, + "epoch": 0.1140754369825207, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0034909825772047043, + "learning_rate": 1e-05, + "loss": 0.0, + "num_tokens": 92962472.0, + "reward": 0.3046875, + "reward_std": 0.27564430236816406, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000006079673767, + "sampling/importance_sampling_ratio/min": 0.0017851731972768903, + "sampling/sampling_logp_difference/max": 6.328239917755127, + "sampling/sampling_logp_difference/mean": 0.019930578768253326, + "step": 124 + }, + { + "clip_ratio/high_max": 2.6292200345778838e-05, + "clip_ratio/high_mean": 7.620442374900449e-06, + "clip_ratio/low_mean": 4.615546390596137e-05, + "clip_ratio/low_min": 1.366510537081922e-05, + "clip_ratio/region_mean": 5.3775906508235494e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16198.0, + "completions/mean_length": 7512.078125, + "completions/mean_terminated_length": 7225.88671875, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9676955863833427, + "epoch": 0.11499540018399264, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023449272848665714, + "learning_rate": 1e-05, + "loss": 0.0454, + "num_tokens": 93950506.0, + "reward": 0.3203125, + "reward_std": 0.22461043298244476, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999359250068665, + "sampling/importance_sampling_ratio/min": 0.0016406332142651081, + "sampling/sampling_logp_difference/max": 6.412672996520996, + "sampling/sampling_logp_difference/mean": 0.020141655579209328, + "step": 125 + }, + { + "clip_ratio/high_max": 5.097255780128762e-06, + "clip_ratio/high_mean": 1.2743139450321905e-06, + "clip_ratio/low_mean": 3.3802551342887455e-05, + "clip_ratio/low_min": 4.146762421441963e-06, + "clip_ratio/region_mean": 3.5076865287919645e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 6920.484375, + "completions/mean_terminated_length": 6693.3603515625, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "entropy": 0.8662540689110756, + "epoch": 0.11591536338546458, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0037103090435266495, + "learning_rate": 1e-05, + "loss": 0.0617, + "num_tokens": 94854016.0, + "reward": 0.4375, + "reward_std": 0.322716623544693, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999761581420898, + "sampling/importance_sampling_ratio/min": 0.00047686786274425685, + "sampling/sampling_logp_difference/max": 7.648271083831787, + "sampling/sampling_logp_difference/mean": 0.01915796287357807, + "step": 126 + }, + { + "clip_ratio/high_max": 8.4922439782531e-06, + "clip_ratio/high_mean": 2.123060994563275e-06, + "clip_ratio/low_mean": 5.024227584726759e-05, + "clip_ratio/low_min": 1.3627016414829995e-05, + "clip_ratio/region_mean": 5.236533706920454e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15867.0, + "completions/mean_length": 7939.609375, + 
"completions/mean_terminated_length": 7805.57177734375, + "completions/min_length": 1260.0, + "completions/min_terminated_length": 1260.0, + "entropy": 0.9707008600234985, + "epoch": 0.11683532658693652, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024642283096909523, + "learning_rate": 1e-05, + "loss": 0.0788, + "num_tokens": 95889966.0, + "reward": 0.2265625, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998771548271179, + "sampling/importance_sampling_ratio/min": 4.540014560916461e-05, + "sampling/sampling_logp_difference/max": 9.999995231628418, + "sampling/sampling_logp_difference/mean": 0.020453302189707756, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.766829564710861e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.766829564710861e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14969.0, + "completions/mean_length": 5985.8203125, + "completions/mean_terminated_length": 5474.43408203125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.9083090648055077, + "epoch": 0.11775528978840846, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003317479742690921, + "learning_rate": 1e-05, + "loss": 0.0537, + "num_tokens": 96676847.0, + "reward": 0.3671875, + "reward_std": 0.287486732006073, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130964279175, + "sampling/importance_sampling_ratio/min": 0.000286750087980181, + "sampling/sampling_logp_difference/max": 8.156899452209473, + "sampling/sampling_logp_difference/mean": 
0.01996719278395176, + "step": 128 + }, + { + "clip_ratio/high_max": 1.8439853647578275e-05, + "clip_ratio/high_mean": 4.609963411894569e-06, + "clip_ratio/low_mean": 5.708034223061986e-05, + "clip_ratio/low_min": 2.75287948170444e-06, + "clip_ratio/region_mean": 6.169030598357494e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15081.0, + "completions/mean_length": 6565.359375, + "completions/mean_terminated_length": 6488.04736328125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 1.1013468354940414, + "epoch": 0.11867525298988041, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019073591101914644, + "learning_rate": 1e-05, + "loss": 0.0622, + "num_tokens": 97539453.0, + "reward": 0.2734375, + "reward_std": 0.307217001914978, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999555945396423, + "sampling/importance_sampling_ratio/min": 0.0006022047018632293, + "sampling/sampling_logp_difference/max": 7.414913177490234, + "sampling/sampling_logp_difference/mean": 0.02150837704539299, + "step": 129 + }, + { + "clip_ratio/high_max": 9.068485269381199e-06, + "clip_ratio/high_mean": 2.2671213173452998e-06, + "clip_ratio/low_mean": 1.9822365402433206e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.208948649240483e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16099.0, + "completions/mean_length": 6779.6171875, + "completions/mean_terminated_length": 6703.9921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8940552547574043, + "epoch": 0.11959521619135234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0010163087863475084, + "learning_rate": 1e-05, + "loss": 0.0249, + 
"num_tokens": 98429036.0, + "reward": 0.453125, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999485015869141, + "sampling/importance_sampling_ratio/min": 3.464699460664633e-08, + "sampling/sampling_logp_difference/max": 17.178054809570312, + "sampling/sampling_logp_difference/mean": 0.018716152757406235, + "step": 130 + }, + { + "clip_ratio/high_max": 5.047242211730918e-06, + "clip_ratio/high_mean": 1.2618105529327295e-06, + "clip_ratio/low_mean": 2.9014110396019532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0275920835265424e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14549.0, + "completions/max_terminated_length": 14549.0, + "completions/mean_length": 5766.71875, + "completions/mean_terminated_length": 5766.71875, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "entropy": 1.0455922111868858, + "epoch": 0.12051517939282429, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002155766822397709, + "learning_rate": 1e-05, + "loss": 0.0238, + "num_tokens": 99184264.0, + "reward": 0.4140625, + "reward_std": 0.3077537715435028, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999253749847412, + "sampling/importance_sampling_ratio/min": 0.00010798005678225309, + "sampling/sampling_logp_difference/max": 9.133563995361328, + "sampling/sampling_logp_difference/mean": 0.020948775112628937, + "step": 131 + }, + { + "clip_ratio/high_max": 2.0882574972347356e-05, + "clip_ratio/high_mean": 6.505383225885453e-06, + "clip_ratio/low_mean": 4.496008500609605e-05, + "clip_ratio/low_min": 7.757854064038838e-06, + "clip_ratio/region_mean": 5.1465468231981504e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14704.0, + "completions/mean_length": 6167.2421875, + "completions/mean_terminated_length": 6005.07177734375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.9100174158811569, + "epoch": 0.12143514259429623, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0021464223973453045, + "learning_rate": 1e-05, + "loss": -0.0279, + "num_tokens": 99996831.0, + "reward": 0.421875, + "reward_std": 0.3916535973548889, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240040779114, + "sampling/importance_sampling_ratio/min": 0.02249590866267681, + "sampling/sampling_logp_difference/max": 3.794421911239624, + "sampling/sampling_logp_difference/mean": 0.01866895705461502, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.0998018473837874e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0998018473837874e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15738.0, + "completions/mean_length": 6242.9453125, + "completions/mean_terminated_length": 6163.09423828125, + "completions/min_length": 1187.0, + "completions/min_terminated_length": 1187.0, + "entropy": 0.8624134212732315, + "epoch": 0.12235510579576817, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023277695290744305, + "learning_rate": 1e-05, + "loss": 0.0524, + "num_tokens": 100814112.0, + "reward": 0.3984375, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999959409236908, + 
"sampling/importance_sampling_ratio/min": 0.0002393616596236825, + "sampling/sampling_logp_difference/max": 8.33753490447998, + "sampling/sampling_logp_difference/mean": 0.0191188994795084, + "step": 133 + }, + { + "clip_ratio/high_max": 6.589872555196052e-06, + "clip_ratio/high_mean": 1.647468138799013e-06, + "clip_ratio/low_mean": 4.329304238126497e-05, + "clip_ratio/low_min": 3.5120251595799346e-06, + "clip_ratio/region_mean": 4.494051017900347e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14866.0, + "completions/mean_length": 5733.6875, + "completions/mean_terminated_length": 5478.080078125, + "completions/min_length": 789.0, + "completions/min_terminated_length": 789.0, + "entropy": 0.9628067463636398, + "epoch": 0.12327506899724011, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003547821193933487, + "learning_rate": 1e-05, + "loss": 0.0321, + "num_tokens": 101566264.0, + "reward": 0.3984375, + "reward_std": 0.36584997177124023, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999400973320007, + "sampling/importance_sampling_ratio/min": 0.0001282035664189607, + "sampling/sampling_logp_difference/max": 8.961891174316406, + "sampling/sampling_logp_difference/mean": 0.019646761938929558, + "step": 134 + }, + { + "clip_ratio/high_max": 1.7107527582993498e-05, + "clip_ratio/high_mean": 4.2768818957483745e-06, + "clip_ratio/low_mean": 3.014796902789385e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.442485103732906e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15848.0, + "completions/max_terminated_length": 15848.0, + "completions/mean_length": 5505.9375, + "completions/mean_terminated_length": 5505.9375, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "entropy": 
0.8041045889258385, + "epoch": 0.12419503219871206, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024891747161746025, + "learning_rate": 1e-05, + "loss": 0.1406, + "num_tokens": 102291456.0, + "reward": 0.5, + "reward_std": 0.35482609272003174, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999248385429382, + "sampling/importance_sampling_ratio/min": 0.0014627616619691253, + "sampling/sampling_logp_difference/max": 6.527429103851318, + "sampling/sampling_logp_difference/mean": 0.01716250739991665, + "step": 135 + }, + { + "clip_ratio/high_max": 1.548903105685895e-05, + "clip_ratio/high_mean": 3.872257764214737e-06, + "clip_ratio/low_mean": 5.380711581892683e-05, + "clip_ratio/low_min": 4.5777483137499075e-06, + "clip_ratio/region_mean": 5.767937363998499e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16005.0, + "completions/max_terminated_length": 16005.0, + "completions/mean_length": 5003.0625, + "completions/mean_terminated_length": 5003.0625, + "completions/min_length": 497.0, + "completions/min_terminated_length": 497.0, + "entropy": 0.9115714654326439, + "epoch": 0.125114995400184, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.00220683915540576, + "learning_rate": 1e-05, + "loss": 0.1361, + "num_tokens": 102949824.0, + "reward": 0.4140625, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999973714351654, + "sampling/importance_sampling_ratio/min": 8.323705696966499e-05, + "sampling/sampling_logp_difference/max": 9.393817901611328, + "sampling/sampling_logp_difference/mean": 0.018076512962579727, + "step": 136 + }, + { + "clip_ratio/high_max": 2.181136096623959e-05, + "clip_ratio/high_mean": 
5.4528402415598975e-06, + "clip_ratio/low_mean": 3.4416837252138066e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.986967681157694e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15658.0, + "completions/max_terminated_length": 15658.0, + "completions/mean_length": 4742.1328125, + "completions/mean_terminated_length": 4742.1328125, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.9430246204137802, + "epoch": 0.12603495860165592, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003964806906878948, + "learning_rate": 1e-05, + "loss": 0.0215, + "num_tokens": 103580913.0, + "reward": 0.4609375, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999952495098114, + "sampling/importance_sampling_ratio/min": 7.031940185697749e-05, + "sampling/sampling_logp_difference/max": 9.56246280670166, + "sampling/sampling_logp_difference/mean": 0.019651200622320175, + "step": 137 + }, + { + "clip_ratio/high_max": 4.07684046876966e-06, + "clip_ratio/high_mean": 1.019210117192415e-06, + "clip_ratio/low_mean": 3.8682398553646635e-05, + "clip_ratio/low_min": 8.189203072106466e-06, + "clip_ratio/region_mean": 3.970160832977854e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 6574.171875, + "completions/mean_terminated_length": 6091.72119140625, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.8429529070854187, + "epoch": 0.12695492180312787, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002067410387098789, + "learning_rate": 1e-05, + "loss": 0.0377, + "num_tokens": 104447463.0, + "reward": 0.3125, + "reward_std": 0.24511480331420898, + "rewards/accuracy_reward/mean": 0.3125, 
+ "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997583627700806, + "sampling/importance_sampling_ratio/min": 0.00021258489869069308, + "sampling/sampling_logp_difference/max": 8.456169128417969, + "sampling/sampling_logp_difference/mean": 0.018853647634387016, + "step": 138 + }, + { + "clip_ratio/high_max": 1.9725823221961036e-05, + "clip_ratio/high_mean": 4.931455805490259e-06, + "clip_ratio/low_mean": 5.9263072444082354e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.419452870431996e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15518.0, + "completions/max_terminated_length": 15518.0, + "completions/mean_length": 4581.5625, + "completions/mean_terminated_length": 4581.5625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.7094272822141647, + "epoch": 0.12787488500459981, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.004292502999305725, + "learning_rate": 1e-05, + "loss": 0.0946, + "num_tokens": 105052287.0, + "reward": 0.625, + "reward_std": 0.3908300995826721, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999477863311768, + "sampling/importance_sampling_ratio/min": 0.0019342642044648528, + "sampling/sampling_logp_difference/max": 6.24802827835083, + "sampling/sampling_logp_difference/mean": 0.016310662031173706, + "step": 139 + }, + { + "clip_ratio/high_max": 1.0132298029930098e-05, + "clip_ratio/high_mean": 2.5330745074825245e-06, + "clip_ratio/low_mean": 4.6397121650443296e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.893019581686531e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16097.0, + "completions/mean_length": 7066.4453125, + 
"completions/mean_terminated_length": 6918.5478515625, + "completions/min_length": 990.0, + "completions/min_terminated_length": 990.0, + "entropy": 0.8481669947504997, + "epoch": 0.12879484820607176, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015785128343850374, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 105977048.0, + "reward": 0.3515625, + "reward_std": 0.27328038215637207, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.00104097044095397, + "sampling/sampling_logp_difference/max": 6.8676018714904785, + "sampling/sampling_logp_difference/mean": 0.018304405733942986, + "step": 140 + }, + { + "clip_ratio/high_max": 1.6989023606583942e-05, + "clip_ratio/high_mean": 4.2472559016459854e-06, + "clip_ratio/low_mean": 2.3075059743860038e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7322315418132348e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16104.0, + "completions/max_terminated_length": 16104.0, + "completions/mean_length": 6230.5234375, + "completions/mean_terminated_length": 6230.5234375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.9658062160015106, + "epoch": 0.1297148114075437, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002542720176279545, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 106793187.0, + "reward": 0.3203125, + "reward_std": 0.3050953149795532, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000169277191162, + "sampling/importance_sampling_ratio/min": 0.0002781494113150984, + "sampling/sampling_logp_difference/max": 8.187352180480957, + 
"sampling/sampling_logp_difference/mean": 0.019391046836972237, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7597974508353218e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7597974508353218e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14216.0, + "completions/mean_length": 5690.5546875, + "completions/mean_terminated_length": 5606.3544921875, + "completions/min_length": 1124.0, + "completions/min_terminated_length": 1124.0, + "entropy": 1.0098655670881271, + "epoch": 0.13063477460901565, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001451602904126048, + "learning_rate": 1e-05, + "loss": 0.0444, + "num_tokens": 107539874.0, + "reward": 0.4296875, + "reward_std": 0.23304283618927002, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999307990074158, + "sampling/importance_sampling_ratio/min": 5.640022671116185e-09, + "sampling/sampling_logp_difference/max": 18.993377685546875, + "sampling/sampling_logp_difference/mean": 0.018607191741466522, + "step": 142 + }, + { + "clip_ratio/high_max": 1.2800467629858758e-05, + "clip_ratio/high_mean": 4.19954119479371e-06, + "clip_ratio/low_mean": 2.350350996493944e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.770305115973315e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15791.0, + "completions/max_terminated_length": 15791.0, + "completions/mean_length": 5471.1328125, + "completions/mean_terminated_length": 5471.1328125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0413162112236023, + "epoch": 0.13155473781048757, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023549250327050686, + "learning_rate": 1e-05, + "loss": 0.0411, + 
"num_tokens": 108260091.0, + "reward": 0.3203125, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999832510948181, + "sampling/importance_sampling_ratio/min": 0.0011709182290360332, + "sampling/sampling_logp_difference/max": 6.749967098236084, + "sampling/sampling_logp_difference/mean": 0.020427243784070015, + "step": 143 + }, + { + "clip_ratio/high_max": 2.1983064925734652e-05, + "clip_ratio/high_mean": 5.495766231433663e-06, + "clip_ratio/low_mean": 4.361141452591255e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9107180757346214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 6211.7421875, + "completions/mean_terminated_length": 6050.2783203125, + "completions/min_length": 622.0, + "completions/min_terminated_length": 622.0, + "entropy": 0.9706784337759018, + "epoch": 0.13247470101195952, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017527056625112891, + "learning_rate": 1e-05, + "loss": 0.0686, + "num_tokens": 109073890.0, + "reward": 0.421875, + "reward_std": 0.29826050996780396, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999092221260071, + "sampling/importance_sampling_ratio/min": 0.002898645820096135, + "sampling/sampling_logp_difference/max": 5.843511581420898, + "sampling/sampling_logp_difference/mean": 0.018898162990808487, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.208964992358233e-05, + "clip_ratio/low_min": 3.9168990042526275e-06, + "clip_ratio/region_mean": 4.208964992358233e-05, + "completions/clipped_ratio": 0.0078125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 14880.0, + "completions/mean_length": 6007.8984375, + "completions/mean_terminated_length": 5926.19677734375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.1967609524726868, + "epoch": 0.13339466421343146, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0007858420140109956, + "learning_rate": 1e-05, + "loss": 0.011, + "num_tokens": 109861813.0, + "reward": 0.296875, + "reward_std": 0.23486506938934326, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340772628784, + "sampling/importance_sampling_ratio/min": 3.294382011631569e-08, + "sampling/sampling_logp_difference/max": 17.22846221923828, + "sampling/sampling_logp_difference/mean": 0.021845955401659012, + "step": 145 + }, + { + "clip_ratio/high_max": 4.5118208618077915e-06, + "clip_ratio/high_mean": 1.1279552154519479e-06, + "clip_ratio/low_mean": 3.749712686840212e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8625082197540905e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15838.0, + "completions/mean_length": 6800.9921875, + "completions/mean_terminated_length": 6725.53564453125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.0437887012958527, + "epoch": 0.1343146274149034, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0029428249690681696, + "learning_rate": 1e-05, + "loss": 0.0405, + "num_tokens": 110756572.0, + "reward": 0.265625, + "reward_std": 0.3248382806777954, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999890327453613, + 
"sampling/importance_sampling_ratio/min": 0.0006329434108920395, + "sampling/sampling_logp_difference/max": 7.365129470825195, + "sampling/sampling_logp_difference/mean": 0.02010120078921318, + "step": 146 + }, + { + "clip_ratio/high_max": 1.427700522071973e-05, + "clip_ratio/high_mean": 3.5692513051799324e-06, + "clip_ratio/low_mean": 4.964020990883e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.320946092979284e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6309.4453125, + "completions/mean_terminated_length": 6230.1181640625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.9768906533718109, + "epoch": 0.13523459061637536, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002088683657348156, + "learning_rate": 1e-05, + "loss": 0.0316, + "num_tokens": 111585493.0, + "reward": 0.375, + "reward_std": 0.39796435832977295, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000007152557373, + "sampling/importance_sampling_ratio/min": 0.009723234921693802, + "sampling/sampling_logp_difference/max": 4.633236885070801, + "sampling/sampling_logp_difference/mean": 0.020927833393216133, + "step": 147 + }, + { + "clip_ratio/high_max": 5.4841398196003865e-06, + "clip_ratio/high_mean": 1.3710349549000966e-06, + "clip_ratio/low_mean": 5.122006064084417e-05, + "clip_ratio/low_min": 3.785125954891555e-06, + "clip_ratio/region_mean": 5.25910957094311e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15209.0, + "completions/mean_length": 6221.859375, + "completions/mean_terminated_length": 6060.5556640625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 
0.9212924689054489, + "epoch": 0.13615455381784727, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002406956860795617, + "learning_rate": 1e-05, + "loss": 0.1051, + "num_tokens": 112400363.0, + "reward": 0.40625, + "reward_std": 0.31929677724838257, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999701976776123, + "sampling/importance_sampling_ratio/min": 5.8308287407271564e-05, + "sampling/sampling_logp_difference/max": 9.74976634979248, + "sampling/sampling_logp_difference/mean": 0.018652018159627914, + "step": 148 + }, + { + "clip_ratio/high_max": 1.4568151755156578e-05, + "clip_ratio/high_mean": 3.6420379387891444e-06, + "clip_ratio/low_mean": 3.999794398623635e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.3639981413434725e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14997.0, + "completions/mean_length": 6942.8203125, + "completions/mean_terminated_length": 6716.232421875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.949538916349411, + "epoch": 0.13707451701931922, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022962254006415606, + "learning_rate": 1e-05, + "loss": 0.0625, + "num_tokens": 113308748.0, + "reward": 0.375, + "reward_std": 0.3329663872718811, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999334812164307, + "sampling/importance_sampling_ratio/min": 0.00048810525913722813, + "sampling/sampling_logp_difference/max": 7.624979496002197, + "sampling/sampling_logp_difference/mean": 0.01939917355775833, + "step": 149 + }, + { + "clip_ratio/high_max": 8.786732450971613e-06, + "clip_ratio/high_mean": 
2.196683112742903e-06, + "clip_ratio/low_mean": 5.562954720517155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7826231113722315e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15182.0, + "completions/mean_length": 6783.1796875, + "completions/mean_terminated_length": 6552.76025390625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "entropy": 0.9774708449840546, + "epoch": 0.13799448022079117, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020560629200190306, + "learning_rate": 1e-05, + "loss": 0.0473, + "num_tokens": 114196235.0, + "reward": 0.34375, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998990297317505, + "sampling/importance_sampling_ratio/min": 2.4757892447269114e-07, + "sampling/sampling_logp_difference/max": 15.211536407470703, + "sampling/sampling_logp_difference/mean": 0.019691556692123413, + "step": 150 + }, + { + "clip_ratio/high_max": 1.799483243303257e-05, + "clip_ratio/high_mean": 4.498708108258143e-06, + "clip_ratio/low_mean": 2.6389980291696702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0888688343111426e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15549.0, + "completions/mean_length": 5568.15625, + "completions/mean_terminated_length": 5396.4765625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "entropy": 0.9303529411554337, + "epoch": 0.1389144434222631, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022214846685528755, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 114928047.0, + "reward": 0.234375, + "reward_std": 0.2585597634315491, + "rewards/accuracy_reward/mean": 0.234375, + 
"rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999408721923828, + "sampling/importance_sampling_ratio/min": 2.1446083337650634e-05, + "sampling/sampling_logp_difference/max": 10.749968528747559, + "sampling/sampling_logp_difference/mean": 0.01938418298959732, + "step": 151 + }, + { + "clip_ratio/high_max": 1.1957493370573502e-05, + "clip_ratio/high_mean": 2.9893733426433755e-06, + "clip_ratio/low_mean": 5.885063319510664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.184000585562899e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15340.0, + "completions/max_terminated_length": 15340.0, + "completions/mean_length": 6086.578125, + "completions/mean_terminated_length": 6086.578125, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "entropy": 0.9131873697042465, + "epoch": 0.13983440662373506, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002448044717311859, + "learning_rate": 1e-05, + "loss": 0.0599, + "num_tokens": 115725657.0, + "reward": 0.40625, + "reward_std": 0.35878273844718933, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999779462814331, + "sampling/importance_sampling_ratio/min": 0.02929726243019104, + "sampling/sampling_logp_difference/max": 3.530261278152466, + "sampling/sampling_logp_difference/mean": 0.019298439845442772, + "step": 152 + }, + { + "clip_ratio/high_max": 1.3385357760853367e-05, + "clip_ratio/high_mean": 3.3463394402133417e-06, + "clip_ratio/low_mean": 5.717015119444113e-05, + "clip_ratio/low_min": 3.4328400033700746e-06, + "clip_ratio/region_mean": 6.0516490520967636e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15987.0, + 
"completions/mean_length": 6442.5390625, + "completions/mean_terminated_length": 6203.9443359375, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "entropy": 0.8959419652819633, + "epoch": 0.140754369825207, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002013204852119088, + "learning_rate": 1e-05, + "loss": 0.0281, + "num_tokens": 116571478.0, + "reward": 0.2734375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000044584274292, + "sampling/importance_sampling_ratio/min": 1.0374163821325055e-06, + "sampling/sampling_logp_difference/max": 13.778777122497559, + "sampling/sampling_logp_difference/mean": 0.01925014518201351, + "step": 153 + }, + { + "clip_ratio/high_max": 9.34224021875707e-06, + "clip_ratio/high_mean": 3.136903728773177e-06, + "clip_ratio/low_mean": 2.9738095065567904e-05, + "clip_ratio/low_min": 3.7240065466903616e-06, + "clip_ratio/region_mean": 3.2874999135401595e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15946.0, + "completions/mean_length": 6633.5703125, + "completions/mean_terminated_length": 6319.0400390625, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 1.0223619118332863, + "epoch": 0.14167433302667892, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024523327592760324, + "learning_rate": 1e-05, + "loss": 0.056, + "num_tokens": 117440743.0, + "reward": 0.3203125, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999213218688965, + "sampling/importance_sampling_ratio/min": 3.0026931199245155e-05, + 
"sampling/sampling_logp_difference/max": 10.413415908813477, + "sampling/sampling_logp_difference/mean": 0.02061290666460991, + "step": 154 + }, + { + "clip_ratio/high_max": 1.4537483366439119e-05, + "clip_ratio/high_mean": 3.6343708416097797e-06, + "clip_ratio/low_mean": 3.954866042477079e-05, + "clip_ratio/low_min": 9.874949228105834e-06, + "clip_ratio/region_mean": 4.318303126638057e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15919.0, + "completions/mean_length": 7183.0, + "completions/mean_terminated_length": 6886.193359375, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.9815369099378586, + "epoch": 0.14259429622815087, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018688985146582127, + "learning_rate": 1e-05, + "loss": 0.0395, + "num_tokens": 118380687.0, + "reward": 0.2890625, + "reward_std": 0.2498900145292282, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999039173126221, + "sampling/importance_sampling_ratio/min": 1.3847662557964213e-05, + "sampling/sampling_logp_difference/max": 11.187394142150879, + "sampling/sampling_logp_difference/mean": 0.019792160019278526, + "step": 155 + }, + { + "clip_ratio/high_max": 7.165636361605721e-06, + "clip_ratio/high_mean": 1.7914090904014301e-06, + "clip_ratio/low_mean": 4.9011068711024564e-05, + "clip_ratio/low_min": 1.0991705721608014e-05, + "clip_ratio/region_mean": 5.0802477687739156e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 6324.640625, + "completions/mean_terminated_length": 5829.91748046875, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "entropy": 0.852975606918335, + "epoch": 
0.14351425942962281, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002005894435569644, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 119207089.0, + "reward": 0.3984375, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000035762786865, + "sampling/importance_sampling_ratio/min": 5.788659223071591e-07, + "sampling/sampling_logp_difference/max": 14.362195014953613, + "sampling/sampling_logp_difference/mean": 0.01853565312922001, + "step": 156 + }, + { + "clip_ratio/high_max": 7.795394822096569e-06, + "clip_ratio/high_mean": 1.948848705524142e-06, + "clip_ratio/low_mean": 3.834237736555224e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0291225786859286e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16281.0, + "completions/mean_length": 5723.421875, + "completions/mean_terminated_length": 5290.06494140625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.8744911625981331, + "epoch": 0.14443422263109476, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002577397273853421, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 119961895.0, + "reward": 0.390625, + "reward_std": 0.34321609139442444, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999703764915466, + "sampling/importance_sampling_ratio/min": 0.07882421463727951, + "sampling/sampling_logp_difference/max": 2.5405349731445312, + "sampling/sampling_logp_difference/mean": 0.018341556191444397, + "step": 157 + }, + { + "clip_ratio/high_max": 9.214097190124448e-06, + "clip_ratio/high_mean": 2.303524297531112e-06, + 
"clip_ratio/low_mean": 2.636873176697918e-05, + "clip_ratio/low_min": 2.9339967113628518e-06, + "clip_ratio/region_mean": 2.8672255837136618e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16055.0, + "completions/mean_length": 7886.015625, + "completions/mean_terminated_length": 7682.064453125, + "completions/min_length": 989.0, + "completions/min_terminated_length": 989.0, + "entropy": 0.9391767829656601, + "epoch": 0.1453541858325667, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002552987542003393, + "learning_rate": 1e-05, + "loss": 0.0273, + "num_tokens": 120990289.0, + "reward": 0.328125, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000030994415283, + "sampling/importance_sampling_ratio/min": 0.000899312668479979, + "sampling/sampling_logp_difference/max": 7.013879776000977, + "sampling/sampling_logp_difference/mean": 0.02049873024225235, + "step": 158 + }, + { + "clip_ratio/high_max": 3.406416203688423e-05, + "clip_ratio/high_mean": 9.72330332160709e-06, + "clip_ratio/low_mean": 3.168332909808669e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.140663151019908e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16276.0, + "completions/mean_length": 6173.1640625, + "completions/mean_terminated_length": 6011.087890625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.9148785546422005, + "epoch": 0.14627414903403863, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002678362652659416, + "learning_rate": 1e-05, + "loss": 0.039, + "num_tokens": 121797958.0, + "reward": 0.4140625, + "reward_std": 0.3608373999595642, + "rewards/accuracy_reward/mean": 0.4140625, + 
"rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999265074729919, + "sampling/importance_sampling_ratio/min": 0.002013920107856393, + "sampling/sampling_logp_difference/max": 6.207672119140625, + "sampling/sampling_logp_difference/mean": 0.018977735191583633, + "step": 159 + }, + { + "clip_ratio/high_max": 1.8476588593330234e-05, + "clip_ratio/high_mean": 4.6191471483325586e-06, + "clip_ratio/low_mean": 4.459614581264759e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9215293188353826e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 6594.21875, + "completions/mean_terminated_length": 6196.259765625, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.9486038386821747, + "epoch": 0.14719411223551057, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033711253199726343, + "learning_rate": 1e-05, + "loss": 0.026, + "num_tokens": 122661170.0, + "reward": 0.3828125, + "reward_std": 0.30457615852355957, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998981356620789, + "sampling/importance_sampling_ratio/min": 0.0002968419576063752, + "sampling/sampling_logp_difference/max": 8.122310638427734, + "sampling/sampling_logp_difference/mean": 0.01938377134501934, + "step": 160 + }, + { + "clip_ratio/high_max": 7.97335997049231e-06, + "clip_ratio/high_mean": 2.7343705824023345e-06, + "clip_ratio/low_mean": 5.420079878604156e-05, + "clip_ratio/low_min": 4.594068286678521e-06, + "clip_ratio/region_mean": 5.693517005056492e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15928.0, + 
"completions/mean_length": 6533.9453125, + "completions/mean_terminated_length": 6377.595703125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9986584335565567, + "epoch": 0.14811407543698252, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017857529455795884, + "learning_rate": 1e-05, + "loss": 0.0804, + "num_tokens": 123518107.0, + "reward": 0.34375, + "reward_std": 0.3356248140335083, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998549818992615, + "sampling/importance_sampling_ratio/min": 9.012701411847956e-06, + "sampling/sampling_logp_difference/max": 11.616875648498535, + "sampling/sampling_logp_difference/mean": 0.02010391652584076, + "step": 161 + }, + { + "clip_ratio/high_max": 4.470512521947967e-06, + "clip_ratio/high_mean": 1.1176281304869917e-06, + "clip_ratio/low_mean": 3.5141094485879876e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.625872295742738e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13212.0, + "completions/mean_length": 5742.21875, + "completions/mean_terminated_length": 5658.42529296875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 1.0379670709371567, + "epoch": 0.14903403863845446, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018227624241262674, + "learning_rate": 1e-05, + "loss": -0.0237, + "num_tokens": 124279031.0, + "reward": 0.21875, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998506903648376, + "sampling/importance_sampling_ratio/min": 0.0020977305248379707, + "sampling/sampling_logp_difference/max": 
6.16689920425415, + "sampling/sampling_logp_difference/mean": 0.019987668842077255, + "step": 162 + }, + { + "clip_ratio/high_max": 1.0003542683989508e-05, + "clip_ratio/high_mean": 3.21091931709816e-06, + "clip_ratio/low_mean": 5.731009014198207e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.0521009800140746e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16354.0, + "completions/mean_length": 7584.703125, + "completions/mean_terminated_length": 7515.41748046875, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "entropy": 0.953459307551384, + "epoch": 0.1499540018399264, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002219022251665592, + "learning_rate": 1e-05, + "loss": 0.0837, + "num_tokens": 125270761.0, + "reward": 0.359375, + "reward_std": 0.37033066153526306, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999880790710449, + "sampling/importance_sampling_ratio/min": 0.0024849213659763336, + "sampling/sampling_logp_difference/max": 5.997514247894287, + "sampling/sampling_logp_difference/mean": 0.020291510969400406, + "step": 163 + }, + { + "clip_ratio/high_max": 7.734669452474918e-06, + "clip_ratio/high_mean": 1.9336673631187296e-06, + "clip_ratio/low_mean": 3.1135301298945706e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3068968605221016e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16133.0, + "completions/mean_length": 4714.671875, + "completions/mean_terminated_length": 4622.78759765625, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 1.018719919025898, + "epoch": 0.15087396504139836, + "frac_reward_zero_std": 0.3125, + "grad_norm": 
0.0014189074281603098, + "learning_rate": 1e-05, + "loss": 0.0501, + "num_tokens": 125895279.0, + "reward": 0.3984375, + "reward_std": 0.28383445739746094, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999479651451111, + "sampling/importance_sampling_ratio/min": 4.017410901724361e-07, + "sampling/sampling_logp_difference/max": 14.727458000183105, + "sampling/sampling_logp_difference/mean": 0.018739396706223488, + "step": 164 + }, + { + "clip_ratio/high_max": 1.0069575182569679e-05, + "clip_ratio/high_mean": 2.5173937956424197e-06, + "clip_ratio/low_mean": 3.824179225375701e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0759185367278405e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15913.0, + "completions/mean_length": 6316.140625, + "completions/mean_terminated_length": 6074.51220703125, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.9325072392821312, + "epoch": 0.15179392824287027, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001702460227534175, + "learning_rate": 1e-05, + "loss": 0.1007, + "num_tokens": 126722881.0, + "reward": 0.4609375, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999539852142334, + "sampling/importance_sampling_ratio/min": 0.0012551364488899708, + "sampling/sampling_logp_difference/max": 6.680510997772217, + "sampling/sampling_logp_difference/mean": 0.01929408684372902, + "step": 165 + }, + { + "clip_ratio/high_max": 6.873041002108948e-06, + "clip_ratio/high_mean": 1.718260250527237e-06, + "clip_ratio/low_mean": 3.119859468370123e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 3.291685527528898e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15832.0, + "completions/mean_length": 4687.140625, + "completions/mean_terminated_length": 4595.03955078125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 1.0886607319116592, + "epoch": 0.15271389144434222, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0032931750174611807, + "learning_rate": 1e-05, + "loss": 0.0078, + "num_tokens": 127341715.0, + "reward": 0.28125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999821186065674, + "sampling/importance_sampling_ratio/min": 0.0019364450126886368, + "sampling/sampling_logp_difference/max": 6.246901512145996, + "sampling/sampling_logp_difference/mean": 0.020621225237846375, + "step": 166 + }, + { + "clip_ratio/high_max": 1.773085250533768e-05, + "clip_ratio/high_mean": 4.43271312633442e-06, + "clip_ratio/low_mean": 4.30743207289197e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7507033741567284e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14125.0, + "completions/mean_length": 5705.515625, + "completions/mean_terminated_length": 5449.232421875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0523068830370903, + "epoch": 0.15363385464581417, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0031696646474301815, + "learning_rate": 1e-05, + "loss": -0.0414, + "num_tokens": 128093597.0, + "reward": 0.1953125, + "reward_std": 0.21778053045272827, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999619126319885, + "sampling/importance_sampling_ratio/min": 3.197810656274669e-05, + "sampling/sampling_logp_difference/max": 10.350459098815918, + "sampling/sampling_logp_difference/mean": 0.021961934864521027, + "step": 167 + }, + { + "clip_ratio/high_max": 1.885905066956184e-05, + "clip_ratio/high_mean": 4.71476266739046e-06, + "clip_ratio/low_mean": 5.0530389898995054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.524515336219338e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15958.0, + "completions/mean_length": 6214.4921875, + "completions/mean_terminated_length": 6053.07177734375, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.9371421113610268, + "epoch": 0.1545538178472861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023704832419753075, + "learning_rate": 1e-05, + "loss": 0.075, + "num_tokens": 128906948.0, + "reward": 0.40625, + "reward_std": 0.34139877557754517, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.0003354824730195105, + "sampling/sampling_logp_difference/max": 7.999940872192383, + "sampling/sampling_logp_difference/mean": 0.01882763020694256, + "step": 168 + }, + { + "clip_ratio/high_max": 3.042072216885572e-05, + "clip_ratio/high_mean": 7.60518054221393e-06, + "clip_ratio/low_mean": 4.5897569179942366e-05, + "clip_ratio/low_min": 8.727477506909054e-06, + "clip_ratio/region_mean": 5.3502750233747065e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 7127.0703125, + "completions/mean_terminated_length": 7054.18115234375, + 
"completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.9854387491941452, + "epoch": 0.15547378104875806, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003370177699252963, + "learning_rate": 1e-05, + "loss": 0.1197, + "num_tokens": 129839813.0, + "reward": 0.359375, + "reward_std": 0.3329663574695587, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999907910823822, + "sampling/importance_sampling_ratio/min": 1.077816432371037e-05, + "sampling/sampling_logp_difference/max": 11.43798828125, + "sampling/sampling_logp_difference/mean": 0.019736800342798233, + "step": 169 + }, + { + "clip_ratio/high_max": 2.1401074718596647e-05, + "clip_ratio/high_mean": 6.243764005375851e-06, + "clip_ratio/low_mean": 3.2797592325550795e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.904135610355297e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15865.0, + "completions/mean_length": 6566.2890625, + "completions/mean_terminated_length": 6330.6640625, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "entropy": 0.7978609576821327, + "epoch": 0.15639374425023, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0026055986527353525, + "learning_rate": 1e-05, + "loss": 0.0661, + "num_tokens": 130698370.0, + "reward": 0.5, + "reward_std": 0.36295419931411743, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999133944511414, + "sampling/importance_sampling_ratio/min": 0.00031152591691352427, + "sampling/sampling_logp_difference/max": 8.074028015136719, + "sampling/sampling_logp_difference/mean": 0.01787097379565239, + "step": 170 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0564424403346493e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0564424403346493e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15576.0, + "completions/max_terminated_length": 15576.0, + "completions/mean_length": 7186.2890625, + "completions/mean_terminated_length": 7186.2890625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 1.0232757329940796, + "epoch": 0.15731370745170192, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0023866184055805206, + "learning_rate": 1e-05, + "loss": 0.0683, + "num_tokens": 131637439.0, + "reward": 0.2734375, + "reward_std": 0.2059282809495926, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999207258224487, + "sampling/importance_sampling_ratio/min": 0.0007378471200354397, + "sampling/sampling_logp_difference/max": 7.211773872375488, + "sampling/sampling_logp_difference/mean": 0.02137116715312004, + "step": 171 + }, + { + "clip_ratio/high_max": 4.037900725961663e-05, + "clip_ratio/high_mean": 1.0094751814904157e-05, + "clip_ratio/low_mean": 5.8380828136250784e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.847557995115494e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13638.0, + "completions/mean_length": 5591.5703125, + "completions/mean_terminated_length": 5420.26220703125, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "entropy": 0.9335208311676979, + "epoch": 0.15823367065317387, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003491115989163518, + "learning_rate": 1e-05, + "loss": 0.0699, + "num_tokens": 132371816.0, + "reward": 0.5, + "reward_std": 0.3406373858451843, + 
"rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999891459941864, + "sampling/importance_sampling_ratio/min": 0.00012356207298580557, + "sampling/sampling_logp_difference/max": 8.998766899108887, + "sampling/sampling_logp_difference/mean": 0.018760837614536285, + "step": 172 + }, + { + "clip_ratio/high_max": 2.8378776733006816e-06, + "clip_ratio/high_mean": 7.094694183251704e-07, + "clip_ratio/low_mean": 4.4085751369493664e-05, + "clip_ratio/low_min": 6.7955093072669115e-06, + "clip_ratio/region_mean": 4.4795220674132e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16302.0, + "completions/mean_length": 7152.3828125, + "completions/mean_terminated_length": 6930.82421875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.1329835206270218, + "epoch": 0.15915363385464582, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002830669516697526, + "learning_rate": 1e-05, + "loss": 0.0526, + "num_tokens": 133307297.0, + "reward": 0.28125, + "reward_std": 0.28801077604293823, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999501705169678, + "sampling/importance_sampling_ratio/min": 0.00028047082014381886, + "sampling/sampling_logp_difference/max": 8.179040908813477, + "sampling/sampling_logp_difference/mean": 0.021548541262745857, + "step": 173 + }, + { + "clip_ratio/high_max": 1.0150829439226072e-05, + "clip_ratio/high_mean": 2.537707359806518e-06, + "clip_ratio/low_mean": 3.4009618616437365e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.654732597624388e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15068.0, + "completions/mean_length": 7263.453125, + "completions/mean_terminated_length": 7118.68310546875, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 1.092760555446148, + "epoch": 0.16007359705611776, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0027821618132293224, + "learning_rate": 1e-05, + "loss": 0.0541, + "num_tokens": 134260107.0, + "reward": 0.3203125, + "reward_std": 0.2858940362930298, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999946117401123, + "sampling/importance_sampling_ratio/min": 7.832317351130769e-05, + "sampling/sampling_logp_difference/max": 9.454667091369629, + "sampling/sampling_logp_difference/mean": 0.022098438814282417, + "step": 174 + }, + { + "clip_ratio/high_max": 1.0561876024439698e-05, + "clip_ratio/high_mean": 2.6404690061099245e-06, + "clip_ratio/low_mean": 1.6864279416495265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9504748649978865e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15388.0, + "completions/mean_length": 7088.8125, + "completions/mean_terminated_length": 6710.958984375, + "completions/min_length": 1314.0, + "completions/min_terminated_length": 1314.0, + "entropy": 1.0669445469975471, + "epoch": 0.1609935602575897, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0007076738984324038, + "learning_rate": 1e-05, + "loss": -0.0197, + "num_tokens": 135186139.0, + "reward": 0.328125, + "reward_std": 0.20593319833278656, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998199343681335, + "sampling/importance_sampling_ratio/min": 
3.084653872065246e-05, + "sampling/sampling_logp_difference/max": 10.386486053466797, + "sampling/sampling_logp_difference/mean": 0.020075790584087372, + "step": 175 + }, + { + "clip_ratio/high_max": 7.095016371749807e-06, + "clip_ratio/high_mean": 1.7737540929374518e-06, + "clip_ratio/low_mean": 2.7592465016823553e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.936621888238733e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15626.0, + "completions/max_terminated_length": 15626.0, + "completions/mean_length": 5352.734375, + "completions/mean_terminated_length": 5352.734375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 1.0387161895632744, + "epoch": 0.16191352345906163, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0022445612121373415, + "learning_rate": 1e-05, + "loss": 0.0261, + "num_tokens": 135888929.0, + "reward": 0.4765625, + "reward_std": 0.399257630109787, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054670333862, + "sampling/importance_sampling_ratio/min": 0.00032565294532105327, + "sampling/sampling_logp_difference/max": 8.029678344726562, + "sampling/sampling_logp_difference/mean": 0.02010166086256504, + "step": 176 + }, + { + "clip_ratio/high_max": 1.5100852124305675e-05, + "clip_ratio/high_mean": 4.426987970873597e-06, + "clip_ratio/low_mean": 2.7625993425317574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2052981168817496e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16266.0, + "completions/mean_length": 7758.90625, + "completions/mean_terminated_length": 7408.29248046875, + "completions/min_length": 742.0, + "completions/min_terminated_length": 742.0, + "entropy": 1.0648984238505363, + "epoch": 0.16283348666053357, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0022021254990249872, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 136901941.0, + "reward": 0.3671875, + "reward_std": 0.2914257347583771, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858140945435, + "sampling/importance_sampling_ratio/min": 2.2461865967216e-07, + "sampling/sampling_logp_difference/max": 15.30886173248291, + "sampling/sampling_logp_difference/mean": 0.021426808089017868, + "step": 177 + }, + { + "clip_ratio/high_max": 2.5346608254039893e-05, + "clip_ratio/high_mean": 7.4063813144675805e-06, + "clip_ratio/low_mean": 2.2069365058996482e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9475746259777225e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16277.0, + "completions/mean_length": 7036.953125, + "completions/mean_terminated_length": 6496.21484375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9684997871518135, + "epoch": 0.16375344986200552, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013461806811392307, + "learning_rate": 1e-05, + "loss": 0.035, + "num_tokens": 137824623.0, + "reward": 0.34375, + "reward_std": 0.2546031177043915, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999944806098938, + "sampling/importance_sampling_ratio/min": 5.834372132085264e-05, + "sampling/sampling_logp_difference/max": 9.74915885925293, + "sampling/sampling_logp_difference/mean": 0.020304443314671516, + "step": 178 + }, + { + "clip_ratio/high_max": 1.3147734080121154e-05, + "clip_ratio/high_mean": 3.2869335200302885e-06, + "clip_ratio/low_mean": 4.841489999307669e-05, 
+ "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.170183294467279e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15500.0, + "completions/mean_length": 6114.1875, + "completions/mean_terminated_length": 5951.1748046875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.943072073161602, + "epoch": 0.16467341306347746, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002132438588887453, + "learning_rate": 1e-05, + "loss": 0.0943, + "num_tokens": 138625247.0, + "reward": 0.40625, + "reward_std": 0.321650892496109, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999298453330994, + "sampling/importance_sampling_ratio/min": 0.0017275095451623201, + "sampling/sampling_logp_difference/max": 6.361074447631836, + "sampling/sampling_logp_difference/mean": 0.020084267482161522, + "step": 179 + }, + { + "clip_ratio/high_max": 1.7873157958092634e-05, + "clip_ratio/high_mean": 4.468289489523158e-06, + "clip_ratio/low_mean": 3.5252990301160025e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9721279790683184e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15050.0, + "completions/mean_length": 7618.875, + "completions/mean_terminated_length": 7034.53369140625, + "completions/min_length": 1030.0, + "completions/min_terminated_length": 1030.0, + "entropy": 0.9142575263977051, + "epoch": 0.1655933762649494, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026741649489849806, + "learning_rate": 1e-05, + "loss": 0.0666, + "num_tokens": 139619287.0, + "reward": 0.2890625, + "reward_std": 0.2927239239215851, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 0.005949751473963261, + "sampling/sampling_logp_difference/max": 5.124405860900879, + "sampling/sampling_logp_difference/mean": 0.020061582326889038, + "step": 180 + }, + { + "clip_ratio/high_max": 1.0512151675357018e-05, + "clip_ratio/high_mean": 2.6280379188392544e-06, + "clip_ratio/low_mean": 4.5301517502593924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.792955542143318e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16106.0, + "completions/max_terminated_length": 16106.0, + "completions/mean_length": 5333.875, + "completions/mean_terminated_length": 5333.875, + "completions/min_length": 1109.0, + "completions/min_terminated_length": 1109.0, + "entropy": 0.8107482865452766, + "epoch": 0.16651333946642136, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027016003150492907, + "learning_rate": 1e-05, + "loss": 0.0544, + "num_tokens": 140318935.0, + "reward": 0.5703125, + "reward_std": 0.2556639611721039, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000013828277588, + "sampling/importance_sampling_ratio/min": 0.006856904830783606, + "sampling/sampling_logp_difference/max": 4.982499122619629, + "sampling/sampling_logp_difference/mean": 0.017069874331355095, + "step": 181 + }, + { + "clip_ratio/high_max": 1.85085939392593e-05, + "clip_ratio/high_mean": 5.24943533264377e-06, + "clip_ratio/low_mean": 5.6120721524166584e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.137015702734061e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16050.0, + "completions/mean_length": 7443.3046875, + "completions/mean_terminated_length": 7154.89501953125, + 
"completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 0.9224414080381393, + "epoch": 0.16743330266789327, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002655779244378209, + "learning_rate": 1e-05, + "loss": 0.0466, + "num_tokens": 141293534.0, + "reward": 0.234375, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999659061431885, + "sampling/importance_sampling_ratio/min": 0.00042018835665658116, + "sampling/sampling_logp_difference/max": 7.774807453155518, + "sampling/sampling_logp_difference/mean": 0.02006504125893116, + "step": 182 + }, + { + "clip_ratio/high_max": 1.494229445597739e-05, + "clip_ratio/high_mean": 3.7355736139943474e-06, + "clip_ratio/low_mean": 2.2748562741981004e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6484136355975352e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15923.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 5646.6875, + "completions/mean_terminated_length": 5646.6875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.8945339694619179, + "epoch": 0.16835326586936522, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0016281780553981662, + "learning_rate": 1e-05, + "loss": 0.0288, + "num_tokens": 142037438.0, + "reward": 0.46875, + "reward_std": 0.17912296950817108, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000030517578125, + "sampling/importance_sampling_ratio/min": 0.0005717006279155612, + "sampling/sampling_logp_difference/max": 7.46689510345459, + "sampling/sampling_logp_difference/mean": 0.019336247816681862, + "step": 183 + }, + { + 
"clip_ratio/high_max": 3.335990868436056e-05, + "clip_ratio/high_mean": 8.33997717109014e-06, + "clip_ratio/low_mean": 3.5050728683927446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.339070608239126e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14142.0, + "completions/mean_length": 6384.640625, + "completions/mean_terminated_length": 5892.86865234375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.840093269944191, + "epoch": 0.16927322907083717, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002166559686884284, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 142873848.0, + "reward": 0.4765625, + "reward_std": 0.35506346821784973, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000462532043457, + "sampling/importance_sampling_ratio/min": 4.785555574926548e-06, + "sampling/sampling_logp_difference/max": 12.249908447265625, + "sampling/sampling_logp_difference/mean": 0.018109092488884926, + "step": 184 + }, + { + "clip_ratio/high_max": 1.541105484648142e-05, + "clip_ratio/high_mean": 3.852763711620355e-06, + "clip_ratio/low_mean": 4.0552770769863855e-05, + "clip_ratio/low_min": 7.133888630050933e-06, + "clip_ratio/region_mean": 4.440553459517105e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14828.0, + "completions/mean_length": 5775.0, + "completions/mean_terminated_length": 5691.46435546875, + "completions/min_length": 1147.0, + "completions/min_terminated_length": 1147.0, + "entropy": 0.8915362879633904, + "epoch": 0.1701931922723091, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021932912059128284, + "learning_rate": 1e-05, + "loss": -0.0086, + "num_tokens": 143636152.0, + "reward": 
0.4375, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000008225440979, + "sampling/importance_sampling_ratio/min": 9.714113069492214e-09, + "sampling/sampling_logp_difference/max": 18.44968605041504, + "sampling/sampling_logp_difference/mean": 0.019278086721897125, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7509142171311396e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7509142171311396e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15122.0, + "completions/mean_length": 6181.640625, + "completions/mean_terminated_length": 6019.69873046875, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "entropy": 1.0544511675834656, + "epoch": 0.17111315547378106, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022947140969336033, + "learning_rate": 1e-05, + "loss": 0.0242, + "num_tokens": 144447370.0, + "reward": 0.234375, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999147653579712, + "sampling/importance_sampling_ratio/min": 7.419757253046555e-08, + "sampling/sampling_logp_difference/max": 16.416534423828125, + "sampling/sampling_logp_difference/mean": 0.02050788700580597, + "step": 186 + }, + { + "clip_ratio/high_max": 1.5700999938417226e-05, + "clip_ratio/high_mean": 3.9252499846043065e-06, + "clip_ratio/low_mean": 2.4595847037289786e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8521096965050674e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15824.0, + "completions/mean_length": 6542.3046875, + "completions/mean_terminated_length": 6306.1044921875, + "completions/min_length": 628.0, + "completions/min_terminated_length": 628.0, + "entropy": 0.933225467801094, + "epoch": 0.17203311867525298, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034910975955426693, + "learning_rate": 1e-05, + "loss": 0.0977, + "num_tokens": 145303505.0, + "reward": 0.390625, + "reward_std": 0.30433881282806396, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999945163726807, + "sampling/importance_sampling_ratio/min": 0.007213745731860399, + "sampling/sampling_logp_difference/max": 4.931766986846924, + "sampling/sampling_logp_difference/mean": 0.020022759214043617, + "step": 187 + }, + { + "clip_ratio/high_max": 6.0999414017715026e-06, + "clip_ratio/high_mean": 1.5249853504428756e-06, + "clip_ratio/low_mean": 2.61421698724007e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7667155109156738e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 5889.4765625, + "completions/mean_terminated_length": 5637.6083984375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.9649673849344254, + "epoch": 0.17295308187672492, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024078311398625374, + "learning_rate": 1e-05, + "loss": 0.0391, + "num_tokens": 146082198.0, + "reward": 0.3359375, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999341368675232, + "sampling/importance_sampling_ratio/min": 
0.0008680344326421618, + "sampling/sampling_logp_difference/max": 7.04927921295166, + "sampling/sampling_logp_difference/mean": 0.02060198038816452, + "step": 188 + }, + { + "clip_ratio/high_max": 7.789618393871933e-06, + "clip_ratio/high_mean": 1.9474045984679833e-06, + "clip_ratio/low_mean": 3.6395756637830345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.834316100892465e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16233.0, + "completions/mean_length": 5349.2421875, + "completions/mean_terminated_length": 5084.408203125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.8402756005525589, + "epoch": 0.17387304507819687, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021191861014813185, + "learning_rate": 1e-05, + "loss": 0.1275, + "num_tokens": 146786245.0, + "reward": 0.4765625, + "reward_std": 0.2801200747489929, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999837875366211, + "sampling/importance_sampling_ratio/min": 3.763807762879878e-05, + "sampling/sampling_logp_difference/max": 10.187494277954102, + "sampling/sampling_logp_difference/mean": 0.017112664878368378, + "step": 189 + }, + { + "clip_ratio/high_max": 1.2461773394534248e-05, + "clip_ratio/high_mean": 3.115443348633562e-06, + "clip_ratio/low_mean": 5.095924211673264e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.4074685294835945e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15786.0, + "completions/mean_length": 7272.3203125, + "completions/mean_terminated_length": 7053.64013671875, + "completions/min_length": 1074.0, + "completions/min_terminated_length": 1074.0, + "entropy": 0.9627499282360077, + "epoch": 
0.17479300827966882, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022120666690170765, + "learning_rate": 1e-05, + "loss": 0.0079, + "num_tokens": 147737086.0, + "reward": 0.2890625, + "reward_std": 0.27304792404174805, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999538660049438, + "sampling/importance_sampling_ratio/min": 1.6960719221970066e-05, + "sampling/sampling_logp_difference/max": 10.984610557556152, + "sampling/sampling_logp_difference/mean": 0.0203307643532753, + "step": 190 + }, + { + "clip_ratio/high_max": 1.7891727566166082e-05, + "clip_ratio/high_mean": 4.472931891541521e-06, + "clip_ratio/low_mean": 5.616715043288423e-05, + "clip_ratio/low_min": 7.80031223257538e-06, + "clip_ratio/region_mean": 6.064008221073891e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16212.0, + "completions/mean_length": 6387.1875, + "completions/mean_terminated_length": 5895.54052734375, + "completions/min_length": 1310.0, + "completions/min_terminated_length": 1310.0, + "entropy": 0.9110158830881119, + "epoch": 0.17571297148114076, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030851473566144705, + "learning_rate": 1e-05, + "loss": 0.1091, + "num_tokens": 148573782.0, + "reward": 0.40625, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99997878074646, + "sampling/importance_sampling_ratio/min": 0.003961040172725916, + "sampling/sampling_logp_difference/max": 5.531248569488525, + "sampling/sampling_logp_difference/mean": 0.018049638718366623, + "step": 191 + }, + { + "clip_ratio/high_max": 1.6994396901282016e-05, + "clip_ratio/high_mean": 5.400205964178895e-06, + 
"clip_ratio/low_mean": 3.274822392995702e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8148429439388565e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 7267.59375, + "completions/mean_terminated_length": 7195.81103515625, + "completions/min_length": 653.0, + "completions/min_terminated_length": 653.0, + "entropy": 0.9254888147115707, + "epoch": 0.1766329346826127, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020694085396826267, + "learning_rate": 1e-05, + "loss": 0.0462, + "num_tokens": 149521258.0, + "reward": 0.2734375, + "reward_std": 0.29719972610473633, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999054670333862, + "sampling/importance_sampling_ratio/min": 7.411616934405174e-06, + "sampling/sampling_logp_difference/max": 11.812461853027344, + "sampling/sampling_logp_difference/mean": 0.01898832805454731, + "step": 192 + }, + { + "clip_ratio/high_max": 4.10414668294834e-06, + "clip_ratio/high_mean": 1.026036670737085e-06, + "clip_ratio/low_mean": 4.7441100377909606e-05, + "clip_ratio/low_min": 4.552241534838686e-06, + "clip_ratio/region_mean": 4.8467136821273016e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16076.0, + "completions/mean_length": 7100.1953125, + "completions/mean_terminated_length": 6952.83349609375, + "completions/min_length": 560.0, + "completions/min_terminated_length": 560.0, + "entropy": 0.8455610796809196, + "epoch": 0.17755289788408463, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003085972974076867, + "learning_rate": 1e-05, + "loss": 0.0108, + "num_tokens": 150447923.0, + "reward": 0.25, + "reward_std": 0.23645778000354767, + "rewards/accuracy_reward/mean": 0.25, + 
"rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999178647994995, + "sampling/importance_sampling_ratio/min": 0.0011708807433024049, + "sampling/sampling_logp_difference/max": 6.749999046325684, + "sampling/sampling_logp_difference/mean": 0.01974140852689743, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.6514521121280268e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6514521121280268e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15535.0, + "completions/mean_length": 6626.4296875, + "completions/mean_terminated_length": 6549.5986328125, + "completions/min_length": 1746.0, + "completions/min_terminated_length": 1746.0, + "entropy": 1.0323699787259102, + "epoch": 0.17847286108555657, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.003505800850689411, + "learning_rate": 1e-05, + "loss": 0.0885, + "num_tokens": 151313834.0, + "reward": 0.390625, + "reward_std": 0.17176413536071777, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999381303787231, + "sampling/importance_sampling_ratio/min": 2.8102756914449856e-05, + "sampling/sampling_logp_difference/max": 10.479642868041992, + "sampling/sampling_logp_difference/mean": 0.021082937717437744, + "step": 194 + }, + { + "clip_ratio/high_max": 2.006086378969485e-05, + "clip_ratio/high_mean": 5.890002398700744e-06, + "clip_ratio/low_mean": 3.503898199141986e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.092898473118112e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15595.0, + "completions/mean_length": 7093.109375, + 
"completions/mean_terminated_length": 6870.12841796875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 1.0206764563918114, + "epoch": 0.17939282428702852, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002495395252481103, + "learning_rate": 1e-05, + "loss": 0.0308, + "num_tokens": 152238192.0, + "reward": 0.2890625, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999728798866272, + "sampling/importance_sampling_ratio/min": 9.536534344078973e-05, + "sampling/sampling_logp_difference/max": 9.257795333862305, + "sampling/sampling_logp_difference/mean": 0.020610272884368896, + "step": 195 + }, + { + "clip_ratio/high_max": 3.2352409107261337e-06, + "clip_ratio/high_mean": 8.088102276815334e-07, + "clip_ratio/low_mean": 4.056704699451075e-05, + "clip_ratio/low_min": 1.1648833606159315e-05, + "clip_ratio/region_mean": 4.1375856994818605e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14191.0, + "completions/mean_length": 6795.71875, + "completions/mean_terminated_length": 6486.4189453125, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "entropy": 0.8927837759256363, + "epoch": 0.18031278748850046, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014066790463402867, + "learning_rate": 1e-05, + "loss": -0.0031, + "num_tokens": 153131828.0, + "reward": 0.3359375, + "reward_std": 0.2409384548664093, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998855590820312, + "sampling/importance_sampling_ratio/min": 5.093755135021638e-06, + "sampling/sampling_logp_difference/max": 12.187495231628418, + 
"sampling/sampling_logp_difference/mean": 0.01874586008489132, + "step": 196 + }, + { + "clip_ratio/high_max": 1.5244630048982799e-05, + "clip_ratio/high_mean": 3.8111575122456998e-06, + "clip_ratio/low_mean": 3.655197178886738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.03631290737394e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15831.0, + "completions/mean_length": 7075.1015625, + "completions/mean_terminated_length": 6617.28662109375, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 0.8989318311214447, + "epoch": 0.1812327506899724, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0017937121447175741, + "learning_rate": 1e-05, + "loss": 0.0359, + "num_tokens": 154057097.0, + "reward": 0.3984375, + "reward_std": 0.23068872094154358, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998950958251953, + "sampling/importance_sampling_ratio/min": 0.00021659507183358073, + "sampling/sampling_logp_difference/max": 8.437480926513672, + "sampling/sampling_logp_difference/mean": 0.01890135183930397, + "step": 197 + }, + { + "clip_ratio/high_max": 1.4074375030759256e-05, + "clip_ratio/high_mean": 4.977033995601232e-06, + "clip_ratio/low_mean": 3.2670792506905855e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.764782627513341e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14100.0, + "completions/mean_length": 7120.0, + "completions/mean_terminated_length": 6743.41455078125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.8758384585380554, + "epoch": 0.18215271389144433, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003410576842725277, + 
"learning_rate": 1e-05, + "loss": 0.0536, + "num_tokens": 154988585.0, + "reward": 0.3984375, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999953508377075, + "sampling/importance_sampling_ratio/min": 0.003589102067053318, + "sampling/sampling_logp_difference/max": 5.629853248596191, + "sampling/sampling_logp_difference/mean": 0.018400676548480988, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.977112736994968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.977112736994968e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15821.0, + "completions/mean_length": 6590.6796875, + "completions/mean_terminated_length": 6513.56689453125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9243742749094963, + "epoch": 0.18307267709291627, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003304310142993927, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 155851000.0, + "reward": 0.3984375, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999579787254333, + "sampling/importance_sampling_ratio/min": 1.2693599273916334e-06, + "sampling/sampling_logp_difference/max": 13.576997756958008, + "sampling/sampling_logp_difference/mean": 0.01959652081131935, + "step": 199 + }, + { + "clip_ratio/high_max": 1.1435367014200892e-05, + "clip_ratio/high_mean": 2.858841753550223e-06, + "clip_ratio/low_mean": 4.7742656533955596e-05, + "clip_ratio/low_min": 8.646529749967158e-06, + "clip_ratio/region_mean": 5.0601498060132144e-05, 
+ "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16115.0, + "completions/mean_length": 6999.484375, + "completions/mean_terminated_length": 6696.7578125, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.843244343996048, + "epoch": 0.18399264029438822, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023830258287489414, + "learning_rate": 1e-05, + "loss": 0.1142, + "num_tokens": 156766782.0, + "reward": 0.359375, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998635053634644, + "sampling/importance_sampling_ratio/min": 0.00014761318743694574, + "sampling/sampling_logp_difference/max": 8.820915222167969, + "sampling/sampling_logp_difference/mean": 0.018434934318065643, + "step": 200 + }, + { + "clip_ratio/high_max": 2.5114631171163637e-05, + "clip_ratio/high_mean": 7.040741365926806e-06, + "clip_ratio/low_mean": 5.3607667723554187e-05, + "clip_ratio/low_min": 9.219345429301029e-06, + "clip_ratio/region_mean": 6.064840863473364e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14986.0, + "completions/mean_length": 6407.5, + "completions/mean_terminated_length": 6249.14306640625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.9549195989966393, + "epoch": 0.18491260349586017, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024427250027656555, + "learning_rate": 1e-05, + "loss": 0.0795, + "num_tokens": 157606126.0, + "reward": 0.3515625, + "reward_std": 0.32879000902175903, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.999966025352478, + "sampling/importance_sampling_ratio/min": 0.0002305622911080718, + "sampling/sampling_logp_difference/max": 8.37498950958252, + "sampling/sampling_logp_difference/mean": 0.0192743968218565, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.928529067958152e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.928529067958152e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15519.0, + "completions/mean_length": 6638.390625, + "completions/mean_terminated_length": 5901.328125, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.9070822075009346, + "epoch": 0.1858325666973321, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002024515997618437, + "learning_rate": 1e-05, + "loss": 0.0604, + "num_tokens": 158474248.0, + "reward": 0.4140625, + "reward_std": 0.28117600083351135, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999830722808838, + "sampling/importance_sampling_ratio/min": 0.0036068728659301996, + "sampling/sampling_logp_difference/max": 5.624914169311523, + "sampling/sampling_logp_difference/mean": 0.01955476775765419, + "step": 202 + }, + { + "clip_ratio/high_max": 8.365173471247545e-06, + "clip_ratio/high_mean": 2.091293367811886e-06, + "clip_ratio/low_mean": 4.1470637825113954e-05, + "clip_ratio/low_min": 4.027710474474588e-06, + "clip_ratio/region_mean": 4.356193130661268e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15708.0, + "completions/mean_length": 7324.546875, + "completions/mean_terminated_length": 6878.99951171875, + "completions/min_length": 571.0, + 
"completions/min_terminated_length": 571.0, + "entropy": 0.9108889549970627, + "epoch": 0.18675252989880406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0022787705529481173, + "learning_rate": 1e-05, + "loss": 0.0616, + "num_tokens": 159434350.0, + "reward": 0.3359375, + "reward_std": 0.26515230536460876, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999351501464844, + "sampling/importance_sampling_ratio/min": 0.03948089852929115, + "sampling/sampling_logp_difference/max": 3.231938362121582, + "sampling/sampling_logp_difference/mean": 0.019122496247291565, + "step": 203 + }, + { + "clip_ratio/high_max": 8.65733409227687e-06, + "clip_ratio/high_mean": 2.1643335230692173e-06, + "clip_ratio/low_mean": 3.456336048657249e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.672769389595487e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13983.0, + "completions/mean_length": 5520.4453125, + "completions/mean_terminated_length": 5434.9052734375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.8982062339782715, + "epoch": 0.18767249310027598, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0026195270475000143, + "learning_rate": 1e-05, + "loss": 0.049, + "num_tokens": 160163055.0, + "reward": 0.4375, + "reward_std": 0.24831004440784454, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 0.0005541297141462564, + "sampling/sampling_logp_difference/max": 7.498111724853516, + "sampling/sampling_logp_difference/mean": 0.019064132124185562, + "step": 204 + }, + { + "clip_ratio/high_max": 
1.8376186289970065e-05, + "clip_ratio/high_mean": 6.650576210631698e-06, + "clip_ratio/low_mean": 4.059042771586974e-05, + "clip_ratio/low_min": 5.350111223378917e-06, + "clip_ratio/region_mean": 4.724100449493562e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15267.0, + "completions/max_terminated_length": 15267.0, + "completions/mean_length": 6846.515625, + "completions/mean_terminated_length": 6846.515625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.9657742157578468, + "epoch": 0.18859245630174792, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0014831912703812122, + "learning_rate": 1e-05, + "loss": 0.006, + "num_tokens": 161057657.0, + "reward": 0.296875, + "reward_std": 0.27198708057403564, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999252557754517, + "sampling/importance_sampling_ratio/min": 6.252834282349795e-05, + "sampling/sampling_logp_difference/max": 9.679890632629395, + "sampling/sampling_logp_difference/mean": 0.020372584462165833, + "step": 205 + }, + { + "clip_ratio/high_max": 1.658901419432368e-05, + "clip_ratio/high_mean": 4.14725354858092e-06, + "clip_ratio/low_mean": 4.473214539757464e-05, + "clip_ratio/low_min": 2.9674999950657366e-06, + "clip_ratio/region_mean": 4.887939894615556e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16370.0, + "completions/mean_length": 6946.8984375, + "completions/mean_terminated_length": 6642.4755859375, + "completions/min_length": 1133.0, + "completions/min_terminated_length": 1133.0, + "entropy": 0.8490508273243904, + "epoch": 0.18951241950321987, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017962189158424735, + "learning_rate": 1e-05, + "loss": 0.0696, + "num_tokens": 161966356.0, + "reward": 0.4296875, + 
"reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999545216560364, + "sampling/importance_sampling_ratio/min": 7.035569433355704e-05, + "sampling/sampling_logp_difference/max": 9.561946868896484, + "sampling/sampling_logp_difference/mean": 0.019146796315908432, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.22491199540309e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.22491199540309e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15123.0, + "completions/mean_length": 6618.9765625, + "completions/mean_terminated_length": 6463.9765625, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.9541772454977036, + "epoch": 0.19043238270469182, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0017619321588426828, + "learning_rate": 1e-05, + "loss": 0.0509, + "num_tokens": 162836705.0, + "reward": 0.390625, + "reward_std": 0.2130674123764038, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999436140060425, + "sampling/importance_sampling_ratio/min": 4.2106199771296815e-07, + "sampling/sampling_logp_difference/max": 14.680485725402832, + "sampling/sampling_logp_difference/mean": 0.020236656069755554, + "step": 207 + }, + { + "clip_ratio/high_max": 1.6846054222696694e-05, + "clip_ratio/high_mean": 4.211513555674173e-06, + "clip_ratio/low_mean": 3.877300162002939e-05, + "clip_ratio/low_min": 4.230834292684449e-06, + "clip_ratio/region_mean": 4.298451551676408e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 12469.0, + "completions/mean_length": 5485.71875, + "completions/mean_terminated_length": 5312.73046875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.8888534903526306, + "epoch": 0.19135234590616376, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002670915797352791, + "learning_rate": 1e-05, + "loss": 0.0709, + "num_tokens": 163558197.0, + "reward": 0.46875, + "reward_std": 0.3145885467529297, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000442266464233, + "sampling/importance_sampling_ratio/min": 0.0005042250850237906, + "sampling/sampling_logp_difference/max": 7.592487812042236, + "sampling/sampling_logp_difference/mean": 0.019581373780965805, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.6889288480779214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.6889288480779214e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16184.0, + "completions/mean_length": 4345.171875, + "completions/mean_terminated_length": 4250.3779296875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.8308270424604416, + "epoch": 0.1922723091076357, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004005427472293377, + "learning_rate": 1e-05, + "loss": 0.1072, + "num_tokens": 164133499.0, + "reward": 0.578125, + "reward_std": 0.31642353534698486, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999247193336487, + "sampling/importance_sampling_ratio/min": 0.022981969639658928, + "sampling/sampling_logp_difference/max": 
3.773045301437378, + "sampling/sampling_logp_difference/mean": 0.017508968710899353, + "step": 209 + }, + { + "clip_ratio/high_max": 1.2997116300539346e-05, + "clip_ratio/high_mean": 3.2492790751348366e-06, + "clip_ratio/low_mean": 2.723402121773688e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0483300406558556e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15509.0, + "completions/mean_length": 5227.296875, + "completions/mean_terminated_length": 5050.20654296875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 0.9231975972652435, + "epoch": 0.19319227230910763, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0031033784616738558, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 164823681.0, + "reward": 0.4765625, + "reward_std": 0.29249146580696106, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999896764755249, + "sampling/importance_sampling_ratio/min": 0.0021342060063034296, + "sampling/sampling_logp_difference/max": 6.149660587310791, + "sampling/sampling_logp_difference/mean": 0.019171088933944702, + "step": 210 + }, + { + "clip_ratio/high_max": 2.0835890609305352e-05, + "clip_ratio/high_mean": 5.208972652326338e-06, + "clip_ratio/low_mean": 2.9314877565411734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.452385044511175e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14160.0, + "completions/mean_length": 6473.4765625, + "completions/mean_terminated_length": 6316.1669921875, + "completions/min_length": 726.0, + "completions/min_terminated_length": 726.0, + "entropy": 0.9061874598264694, + "epoch": 0.19411223551057957, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.003495733719319105, + "learning_rate": 1e-05, + "loss": 0.0785, + "num_tokens": 165668798.0, + "reward": 0.4765625, + "reward_std": 0.3469353914260864, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000354051589966, + "sampling/importance_sampling_ratio/min": 0.0004697878030128777, + "sampling/sampling_logp_difference/max": 7.663229465484619, + "sampling/sampling_logp_difference/mean": 0.018978482112288475, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.991967162164656e-05, + "clip_ratio/low_min": 6.304534053924726e-06, + "clip_ratio/region_mean": 3.991967162164656e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14659.0, + "completions/mean_length": 7140.1953125, + "completions/mean_terminated_length": 6605.4296875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.9605444446206093, + "epoch": 0.19503219871205152, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002381941769272089, + "learning_rate": 1e-05, + "loss": 0.027, + "num_tokens": 166603375.0, + "reward": 0.3046875, + "reward_std": 0.27776598930358887, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999864935874939, + "sampling/importance_sampling_ratio/min": 0.00043123820796608925, + "sampling/sampling_logp_difference/max": 7.748849868774414, + "sampling/sampling_logp_difference/mean": 0.021141134202480316, + "step": 212 + }, + { + "clip_ratio/high_max": 1.4948576790629886e-05, + "clip_ratio/high_mean": 3.7371441976574715e-06, + "clip_ratio/low_mean": 3.4953729482367635e-05, + "clip_ratio/low_min": 3.991060111729894e-06, + 
"clip_ratio/region_mean": 3.869087413477246e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13770.0, + "completions/mean_length": 5304.46875, + "completions/mean_terminated_length": 5038.56005859375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.9176690131425858, + "epoch": 0.19595216191352346, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0040566748939454556, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 167302275.0, + "reward": 0.4296875, + "reward_std": 0.33114415407180786, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999827742576599, + "sampling/importance_sampling_ratio/min": 5.001809313398553e-07, + "sampling/sampling_logp_difference/max": 14.508296012878418, + "sampling/sampling_logp_difference/mean": 0.018822530284523964, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.653866999935417e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.653866999935417e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15791.0, + "completions/mean_length": 5796.5, + "completions/mean_terminated_length": 5542.400390625, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.9230027198791504, + "epoch": 0.1968721251149954, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021502040326595306, + "learning_rate": 1e-05, + "loss": 0.0737, + "num_tokens": 168063627.0, + "reward": 0.3828125, + "reward_std": 0.3158818483352661, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999223351478577, + "sampling/importance_sampling_ratio/min": 0.009504453279078007, + "sampling/sampling_logp_difference/max": 4.655994892120361, + "sampling/sampling_logp_difference/mean": 0.01985779032111168, + "step": 214 + }, + { + "clip_ratio/high_max": 1.0863841453101486e-05, + "clip_ratio/high_mean": 2.7159603632753715e-06, + "clip_ratio/low_mean": 2.4175752741939505e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6891713218901714e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14814.0, + "completions/mean_length": 6135.4921875, + "completions/mean_terminated_length": 6054.79541015625, + "completions/min_length": 1259.0, + "completions/min_terminated_length": 1259.0, + "entropy": 0.869445689022541, + "epoch": 0.19779208831646733, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0027786416467279196, + "learning_rate": 1e-05, + "loss": 0.0139, + "num_tokens": 168867858.0, + "reward": 0.4609375, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999550580978394, + "sampling/importance_sampling_ratio/min": 2.6089865059475414e-05, + "sampling/sampling_logp_difference/max": 10.553963661193848, + "sampling/sampling_logp_difference/mean": 0.018514130264520645, + "step": 215 + }, + { + "clip_ratio/high_max": 4.36788013757905e-06, + "clip_ratio/high_mean": 1.0919700343947625e-06, + "clip_ratio/low_mean": 1.993327998661698e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0852980330564606e-06, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15691.0, + "completions/mean_length": 6268.2421875, + "completions/mean_terminated_length": 6025.46435546875, + "completions/min_length": 
627.0, + "completions/min_terminated_length": 627.0, + "entropy": 0.951081782579422, + "epoch": 0.19871205151793928, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0007328780484385788, + "learning_rate": 1e-05, + "loss": 0.0188, + "num_tokens": 169689969.0, + "reward": 0.3828125, + "reward_std": 0.10994865000247955, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000133514404297, + "sampling/importance_sampling_ratio/min": 1.6650999896228313e-05, + "sampling/sampling_logp_difference/max": 11.003040313720703, + "sampling/sampling_logp_difference/mean": 0.02005261555314064, + "step": 216 + }, + { + "clip_ratio/high_max": 2.131336282218399e-05, + "clip_ratio/high_mean": 5.3283407055459975e-06, + "clip_ratio/low_mean": 3.5254403428552905e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.058274430462916e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13861.0, + "completions/mean_length": 5440.8984375, + "completions/mean_terminated_length": 5354.732421875, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 0.8271932750940323, + "epoch": 0.19963201471941122, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034721922129392624, + "learning_rate": 1e-05, + "loss": -0.0245, + "num_tokens": 170409292.0, + "reward": 0.53125, + "reward_std": 0.30327308177948, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998912811279297, + "sampling/importance_sampling_ratio/min": 1.8372484191786498e-05, + "sampling/sampling_logp_difference/max": 10.904656410217285, + "sampling/sampling_logp_difference/mean": 0.019136395305395126, + "step": 217 + }, + { + 
"clip_ratio/high_max": 1.2339016848272877e-05, + "clip_ratio/high_mean": 4.13687178024702e-06, + "clip_ratio/low_mean": 2.156280152121326e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.569967330146028e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15086.0, + "completions/mean_length": 6671.046875, + "completions/mean_terminated_length": 6594.56689453125, + "completions/min_length": 748.0, + "completions/min_terminated_length": 748.0, + "entropy": 0.9659745842218399, + "epoch": 0.20055197792088317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027575206477195024, + "learning_rate": 1e-05, + "loss": 0.0286, + "num_tokens": 171280714.0, + "reward": 0.375, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999411702156067, + "sampling/importance_sampling_ratio/min": 1.5700872609158978e-05, + "sampling/sampling_logp_difference/max": 11.06179428100586, + "sampling/sampling_logp_difference/mean": 0.019089506939053535, + "step": 218 + }, + { + "clip_ratio/high_max": 1.4603458112105727e-05, + "clip_ratio/high_mean": 3.650864528026432e-06, + "clip_ratio/low_mean": 3.2977761520669446e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.662862599185246e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15752.0, + "completions/mean_length": 7781.5546875, + "completions/mean_terminated_length": 7504.05615234375, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 1.1691131889820099, + "epoch": 0.2014719411223551, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0012711051385849714, + "learning_rate": 1e-05, + "loss": 0.0115, + "num_tokens": 172302489.0, + "reward": 0.109375, + 
"reward_std": 0.1751839816570282, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998820424079895, + "sampling/importance_sampling_ratio/min": 0.005086081102490425, + "sampling/sampling_logp_difference/max": 5.281247615814209, + "sampling/sampling_logp_difference/mean": 0.023309212177991867, + "step": 219 + }, + { + "clip_ratio/high_max": 6.842087486802484e-06, + "clip_ratio/high_mean": 1.710521871700621e-06, + "clip_ratio/low_mean": 4.5269940528669395e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6980462457213434e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14891.0, + "completions/mean_length": 6489.96875, + "completions/mean_terminated_length": 6332.9208984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.9354017227888107, + "epoch": 0.20239190432382706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016933141741901636, + "learning_rate": 1e-05, + "loss": 0.0156, + "num_tokens": 173149653.0, + "reward": 0.484375, + "reward_std": 0.32325342297554016, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 0.008998609147965908, + "sampling/sampling_logp_difference/max": 4.7106852531433105, + "sampling/sampling_logp_difference/mean": 0.019165027886629105, + "step": 220 + }, + { + "clip_ratio/high_max": 2.444740721330163e-05, + "clip_ratio/high_mean": 6.111851803325408e-06, + "clip_ratio/low_mean": 3.0998270403870265e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.711012095664046e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14943.0, + 
"completions/max_terminated_length": 14943.0, + "completions/mean_length": 6309.75, + "completions/mean_terminated_length": 6309.75, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 1.012483686208725, + "epoch": 0.20331186752529898, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024940327275544405, + "learning_rate": 1e-05, + "loss": 0.0552, + "num_tokens": 173976797.0, + "reward": 0.4375, + "reward_std": 0.2790592610836029, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 0.0018720829393714666, + "sampling/sampling_logp_difference/max": 6.280703544616699, + "sampling/sampling_logp_difference/mean": 0.020797956734895706, + "step": 221 + }, + { + "clip_ratio/high_max": 1.1112337460872368e-05, + "clip_ratio/high_mean": 3.5388877677178243e-06, + "clip_ratio/low_mean": 1.7024583712554886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.056347148027271e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16362.0, + "completions/mean_length": 7574.984375, + "completions/mean_terminated_length": 7363.568359375, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9144782647490501, + "epoch": 0.20423183072677092, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002748408354818821, + "learning_rate": 1e-05, + "loss": 0.0588, + "num_tokens": 174965259.0, + "reward": 0.2734375, + "reward_std": 0.25224411487579346, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000108480453491, + "sampling/importance_sampling_ratio/min": 0.005681300535798073, + 
"sampling/sampling_logp_difference/max": 5.170575141906738, + "sampling/sampling_logp_difference/mean": 0.019229793921113014, + "step": 222 + }, + { + "clip_ratio/high_max": 1.4946090004741563e-05, + "clip_ratio/high_mean": 3.736522501185391e-06, + "clip_ratio/low_mean": 3.722507381098694e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.096159636901575e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15365.0, + "completions/mean_length": 6962.7734375, + "completions/mean_terminated_length": 6499.43408203125, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.9248140156269073, + "epoch": 0.20515179392824287, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020343128126114607, + "learning_rate": 1e-05, + "loss": 0.0714, + "num_tokens": 175876446.0, + "reward": 0.421875, + "reward_std": 0.3156445026397705, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999679327011108, + "sampling/importance_sampling_ratio/min": 0.0001609467581147328, + "sampling/sampling_logp_difference/max": 8.734436988830566, + "sampling/sampling_logp_difference/mean": 0.01860032044351101, + "step": 223 + }, + { + "clip_ratio/high_max": 4.226114015182247e-06, + "clip_ratio/high_mean": 1.0565285037955618e-06, + "clip_ratio/low_mean": 3.189400638348161e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.295053488727717e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14978.0, + "completions/mean_length": 6422.28125, + "completions/mean_terminated_length": 6264.1591796875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.7786787301301956, + "epoch": 0.20607175712971482, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0029119597747921944, + "learning_rate": 1e-05, + "loss": 0.1116, + "num_tokens": 176717226.0, + "reward": 0.578125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.578125, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918937683105, + "sampling/importance_sampling_ratio/min": 0.0006287595024332404, + "sampling/sampling_logp_difference/max": 7.371761798858643, + "sampling/sampling_logp_difference/mean": 0.01786171644926071, + "step": 224 + }, + { + "clip_ratio/high_max": 5.4112551879370585e-06, + "clip_ratio/high_mean": 1.3528137969842646e-06, + "clip_ratio/low_mean": 2.103693077515345e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2389744572137715e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16030.0, + "completions/mean_length": 6662.65625, + "completions/mean_terminated_length": 6508.349609375, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.9501350447535515, + "epoch": 0.20699172033118676, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0027519147843122482, + "learning_rate": 1e-05, + "loss": 0.0204, + "num_tokens": 177586766.0, + "reward": 0.421875, + "reward_std": 0.21382881700992584, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000051259994507, + "sampling/importance_sampling_ratio/min": 2.507045428501442e-05, + "sampling/sampling_logp_difference/max": 10.593820571899414, + "sampling/sampling_logp_difference/mean": 0.020679686218500137, + "step": 225 + }, + { + "clip_ratio/high_max": 3.2487785119883483e-06, + "clip_ratio/high_mean": 8.121946279970871e-07, + "clip_ratio/low_mean": 
5.783435085504607e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.8646545539886574e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15419.0, + "completions/mean_length": 6546.171875, + "completions/mean_terminated_length": 6146.259765625, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "entropy": 0.9217342138290405, + "epoch": 0.20791168353265868, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017936143558472395, + "learning_rate": 1e-05, + "loss": 0.0748, + "num_tokens": 178444556.0, + "reward": 0.3984375, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000327825546265, + "sampling/importance_sampling_ratio/min": 8.447741129202768e-05, + "sampling/sampling_logp_difference/max": 9.379026412963867, + "sampling/sampling_logp_difference/mean": 0.019764548167586327, + "step": 226 + }, + { + "clip_ratio/high_max": 2.1980493102091714e-05, + "clip_ratio/high_mean": 5.4951232755229285e-06, + "clip_ratio/low_mean": 4.3977801396977156e-05, + "clip_ratio/low_min": 7.912247156127705e-06, + "clip_ratio/region_mean": 4.947292427459615e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15707.0, + "completions/max_terminated_length": 15707.0, + "completions/mean_length": 6433.9296875, + "completions/mean_terminated_length": 6433.9296875, + "completions/min_length": 731.0, + "completions/min_terminated_length": 731.0, + "entropy": 0.9361409991979599, + "epoch": 0.20883164673413063, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031324021983891726, + "learning_rate": 1e-05, + "loss": 0.0505, + "num_tokens": 179288499.0, + "reward": 0.453125, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 
0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999454021453857, + "sampling/importance_sampling_ratio/min": 0.00018488657951820642, + "sampling/sampling_logp_difference/max": 8.595767974853516, + "sampling/sampling_logp_difference/mean": 0.019691072404384613, + "step": 227 + }, + { + "clip_ratio/high_max": 1.299416817346355e-05, + "clip_ratio/high_mean": 3.2485420433658874e-06, + "clip_ratio/low_mean": 3.756406420052372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.081260635757644e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15787.0, + "completions/mean_length": 6037.75, + "completions/mean_terminated_length": 5873.52392578125, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "entropy": 0.8700985535979271, + "epoch": 0.20975160993560257, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024714914616197348, + "learning_rate": 1e-05, + "loss": 0.0044, + "num_tokens": 180079619.0, + "reward": 0.484375, + "reward_std": 0.21436560153961182, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999628067016602, + "sampling/importance_sampling_ratio/min": 8.4841696661897e-05, + "sampling/sampling_logp_difference/max": 9.374723434448242, + "sampling/sampling_logp_difference/mean": 0.018519341945648193, + "step": 228 + }, + { + "clip_ratio/high_max": 7.293307589861797e-06, + "clip_ratio/high_mean": 1.8233268974654493e-06, + "clip_ratio/low_mean": 2.2305866423266707e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.412919320704532e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12264.0, + "completions/max_terminated_length": 12264.0, + "completions/mean_length": 5305.828125, + 
"completions/mean_terminated_length": 5305.828125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 1.1309608668088913, + "epoch": 0.21067157313707452, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003593914210796356, + "learning_rate": 1e-05, + "loss": 0.0478, + "num_tokens": 180780877.0, + "reward": 0.3984375, + "reward_std": 0.24671241641044617, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011920928955, + "sampling/importance_sampling_ratio/min": 0.009941472671926022, + "sampling/sampling_logp_difference/max": 4.611040115356445, + "sampling/sampling_logp_difference/mean": 0.020471621304750443, + "step": 229 + }, + { + "clip_ratio/high_max": 2.0163415001661633e-05, + "clip_ratio/high_mean": 5.040853750415408e-06, + "clip_ratio/low_mean": 4.4980357415624894e-05, + "clip_ratio/low_min": 1.0012816346716136e-05, + "clip_ratio/region_mean": 5.0021211109196884e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13814.0, + "completions/mean_length": 6022.96875, + "completions/mean_terminated_length": 5774.30419921875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.8560900762677193, + "epoch": 0.21159153633854647, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029816587921231985, + "learning_rate": 1e-05, + "loss": 0.0913, + "num_tokens": 181571465.0, + "reward": 0.515625, + "reward_std": 0.41504397988319397, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999518394470215, + "sampling/importance_sampling_ratio/min": 1.5958334188326262e-05, + "sampling/sampling_logp_difference/max": 11.04552936553955, + 
"sampling/sampling_logp_difference/mean": 0.0181986466050148, + "step": 230 + }, + { + "clip_ratio/high_max": 1.8430865566188004e-05, + "clip_ratio/high_mean": 6.177042905619601e-06, + "clip_ratio/low_mean": 4.450247388376738e-05, + "clip_ratio/low_min": 4.840271230932558e-06, + "clip_ratio/region_mean": 5.067951724413433e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15130.0, + "completions/max_terminated_length": 15130.0, + "completions/mean_length": 6647.71875, + "completions/mean_terminated_length": 6647.71875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.9455481320619583, + "epoch": 0.2125114995400184, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0031632622703909874, + "learning_rate": 1e-05, + "loss": 0.1317, + "num_tokens": 182440957.0, + "reward": 0.3828125, + "reward_std": 0.39902517199516296, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000306367874146, + "sampling/importance_sampling_ratio/min": 1.4739508515049238e-05, + "sampling/sampling_logp_difference/max": 11.124979019165039, + "sampling/sampling_logp_difference/mean": 0.01906408555805683, + "step": 231 + }, + { + "clip_ratio/high_max": 2.2937053017813014e-05, + "clip_ratio/high_mean": 5.7342632544532535e-06, + "clip_ratio/low_mean": 6.042617155799235e-05, + "clip_ratio/low_min": 1.1000354334100848e-05, + "clip_ratio/region_mean": 6.616043401663774e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15988.0, + "completions/mean_length": 6809.1640625, + "completions/mean_terminated_length": 6500.29833984375, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "entropy": 1.050546184182167, + "epoch": 0.21343146274149033, + "frac_reward_zero_std": 0.1875, + "grad_norm": 
0.00162694591563195, + "learning_rate": 1e-05, + "loss": 0.0346, + "num_tokens": 183332242.0, + "reward": 0.421875, + "reward_std": 0.33616161346435547, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000290870666504, + "sampling/importance_sampling_ratio/min": 4.244970114086755e-06, + "sampling/sampling_logp_difference/max": 12.369775772094727, + "sampling/sampling_logp_difference/mean": 0.021866722032427788, + "step": 232 + }, + { + "clip_ratio/high_max": 1.4678411844215589e-05, + "clip_ratio/high_mean": 3.669602961053897e-06, + "clip_ratio/low_mean": 2.4373607971028832e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8043211159456405e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16348.0, + "completions/mean_length": 6815.5, + "completions/mean_terminated_length": 6506.83837890625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 1.060033954679966, + "epoch": 0.21435142594296228, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024887355975806713, + "learning_rate": 1e-05, + "loss": 0.1059, + "num_tokens": 184225138.0, + "reward": 0.328125, + "reward_std": 0.2869548499584198, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999393820762634, + "sampling/importance_sampling_ratio/min": 0.00012930770753882825, + "sampling/sampling_logp_difference/max": 8.953315734863281, + "sampling/sampling_logp_difference/mean": 0.02019432932138443, + "step": 233 + }, + { + "clip_ratio/high_max": 7.910891326901037e-06, + "clip_ratio/high_mean": 1.9777228317252593e-06, + "clip_ratio/low_mean": 3.8802519611635944e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.078024221598753e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15838.0, + "completions/mean_length": 6928.4453125, + "completions/mean_terminated_length": 6623.42724609375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "entropy": 0.9051575735211372, + "epoch": 0.21527138914443422, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002783838426694274, + "learning_rate": 1e-05, + "loss": 0.0624, + "num_tokens": 185136323.0, + "reward": 0.3359375, + "reward_std": 0.25460803508758545, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999524354934692, + "sampling/importance_sampling_ratio/min": 1.0146355634788051e-05, + "sampling/sampling_logp_difference/max": 11.498395919799805, + "sampling/sampling_logp_difference/mean": 0.01905050128698349, + "step": 234 + }, + { + "clip_ratio/high_max": 4.399394583742833e-06, + "clip_ratio/high_mean": 1.0998486459357082e-06, + "clip_ratio/low_mean": 1.733424267058581e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.8434091430208355e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14703.0, + "completions/mean_length": 7155.1328125, + "completions/mean_terminated_length": 7082.46435546875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 1.0119014978408813, + "epoch": 0.21619135234590617, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002105508930981159, + "learning_rate": 1e-05, + "loss": 0.0655, + "num_tokens": 186071324.0, + "reward": 0.328125, + "reward_std": 0.26303553581237793, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999904990196228, + "sampling/importance_sampling_ratio/min": 0.003494206117466092, + "sampling/sampling_logp_difference/max": 5.656649112701416, + "sampling/sampling_logp_difference/mean": 0.020860780030488968, + "step": 235 + }, + { + "clip_ratio/high_max": 1.0561529961705673e-05, + "clip_ratio/high_mean": 3.4390433256703545e-06, + "clip_ratio/low_mean": 2.8499469067355676e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.193851205196552e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16176.0, + "completions/max_terminated_length": 16176.0, + "completions/mean_length": 7463.2421875, + "completions/mean_terminated_length": 7463.2421875, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "entropy": 0.9983502700924873, + "epoch": 0.21711131554737811, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013582308311015368, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 187045035.0, + "reward": 0.3984375, + "reward_std": 0.2517249584197998, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999428987503052, + "sampling/importance_sampling_ratio/min": 0.000473080639494583, + "sampling/sampling_logp_difference/max": 7.65624475479126, + "sampling/sampling_logp_difference/mean": 0.021131811663508415, + "step": 236 + }, + { + "clip_ratio/high_max": 8.509013468938065e-06, + "clip_ratio/high_mean": 2.127253367234516e-06, + "clip_ratio/low_mean": 3.985050443588989e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.197775751890731e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14938.0, + "completions/mean_length": 6460.984375, + "completions/mean_terminated_length": 
6382.8505859375, + "completions/min_length": 1747.0, + "completions/min_terminated_length": 1747.0, + "entropy": 0.7869217246770859, + "epoch": 0.21803127874885003, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002681629965081811, + "learning_rate": 1e-05, + "loss": 0.0987, + "num_tokens": 187889609.0, + "reward": 0.5234375, + "reward_std": 0.39082521200180054, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999568462371826, + "sampling/importance_sampling_ratio/min": 0.0015037209959700704, + "sampling/sampling_logp_difference/max": 6.499812602996826, + "sampling/sampling_logp_difference/mean": 0.016937749460339546, + "step": 237 + }, + { + "clip_ratio/high_max": 1.2362176221358823e-05, + "clip_ratio/high_mean": 3.0905440553397057e-06, + "clip_ratio/low_mean": 5.0333514764133724e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.342405825103924e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15893.0, + "completions/mean_length": 6241.78125, + "completions/mean_terminated_length": 6161.92138671875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "entropy": 1.0217387825250626, + "epoch": 0.21895124195032198, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021239183843135834, + "learning_rate": 1e-05, + "loss": 0.0353, + "num_tokens": 188706605.0, + "reward": 0.2578125, + "reward_std": 0.3135277330875397, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999796748161316, + "sampling/importance_sampling_ratio/min": 0.004853047896176577, + "sampling/sampling_logp_difference/max": 5.328148365020752, + "sampling/sampling_logp_difference/mean": 
0.02103862166404724, + "step": 238 + }, + { + "clip_ratio/high_max": 6.725130333506968e-06, + "clip_ratio/high_mean": 1.681282583376742e-06, + "clip_ratio/low_mean": 3.437372129155847e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.605500387493521e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15332.0, + "completions/mean_length": 5638.1328125, + "completions/mean_terminated_length": 5553.51953125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.7844365313649178, + "epoch": 0.21987120515179392, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023868419229984283, + "learning_rate": 1e-05, + "loss": 0.0458, + "num_tokens": 189446294.0, + "reward": 0.515625, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000369548797607, + "sampling/importance_sampling_ratio/min": 0.0008047468145377934, + "sampling/sampling_logp_difference/max": 7.124982833862305, + "sampling/sampling_logp_difference/mean": 0.017401430755853653, + "step": 239 + }, + { + "clip_ratio/high_max": 2.887730215661577e-05, + "clip_ratio/high_mean": 7.219325539153942e-06, + "clip_ratio/low_mean": 2.826443028425274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.548375502759882e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16196.0, + "completions/mean_length": 6374.8046875, + "completions/mean_terminated_length": 6215.9287109375, + "completions/min_length": 722.0, + "completions/min_terminated_length": 722.0, + "entropy": 0.9472770467400551, + "epoch": 0.22079116835326587, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027549315709620714, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 
190281461.0, + "reward": 0.3984375, + "reward_std": 0.3167053163051605, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998682737350464, + "sampling/importance_sampling_ratio/min": 7.100860239006579e-05, + "sampling/sampling_logp_difference/max": 9.552709579467773, + "sampling/sampling_logp_difference/mean": 0.020243138074874878, + "step": 240 + }, + { + "clip_ratio/high_max": 1.586787766427733e-05, + "clip_ratio/high_mean": 3.9669694160693325e-06, + "clip_ratio/low_mean": 2.978218674343225e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.374915604581474e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15097.0, + "completions/mean_length": 6654.21875, + "completions/mean_terminated_length": 6499.88134765625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 1.0028243213891983, + "epoch": 0.22171113155473782, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0013344973558560014, + "learning_rate": 1e-05, + "loss": 0.0184, + "num_tokens": 191156249.0, + "reward": 0.359375, + "reward_std": 0.22832971811294556, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 0.0021875568199902773, + "sampling/sampling_logp_difference/max": 6.124969959259033, + "sampling/sampling_logp_difference/mean": 0.020470600575208664, + "step": 241 + }, + { + "clip_ratio/high_max": 1.681529829511419e-05, + "clip_ratio/high_mean": 4.9954849146160996e-06, + "clip_ratio/low_mean": 2.040554932136729e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5401033553862362e-05, + "completions/clipped_ratio": 0.0234375, 
+ "completions/max_length": 16384.0, + "completions/max_terminated_length": 16172.0, + "completions/mean_length": 6767.7890625, + "completions/mean_terminated_length": 6537.00048828125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.9059296399354935, + "epoch": 0.22263109475620976, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016136945923790336, + "learning_rate": 1e-05, + "loss": 0.0816, + "num_tokens": 192040526.0, + "reward": 0.4921875, + "reward_std": 0.2909066081047058, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999668598175049, + "sampling/importance_sampling_ratio/min": 1.2452921509975567e-05, + "sampling/sampling_logp_difference/max": 11.29355525970459, + "sampling/sampling_logp_difference/mean": 0.020058143883943558, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9821966563758906e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9821966563758906e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16275.0, + "completions/max_terminated_length": 16275.0, + "completions/mean_length": 6767.4921875, + "completions/mean_terminated_length": 6767.4921875, + "completions/min_length": 998.0, + "completions/min_terminated_length": 998.0, + "entropy": 1.0446822568774223, + "epoch": 0.22355105795768168, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002869367366656661, + "learning_rate": 1e-05, + "loss": 0.0212, + "num_tokens": 192926469.0, + "reward": 0.3828125, + "reward_std": 0.2517249882221222, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586343765259, + "sampling/importance_sampling_ratio/min": 
1.9328599591972306e-05, + "sampling/sampling_logp_difference/max": 10.853924751281738, + "sampling/sampling_logp_difference/mean": 0.021512050181627274, + "step": 243 + }, + { + "clip_ratio/high_max": 3.44581130775623e-05, + "clip_ratio/high_mean": 1.3001711295146379e-05, + "clip_ratio/low_mean": 3.6407937841431703e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.940964981869911e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16261.0, + "completions/max_terminated_length": 16261.0, + "completions/mean_length": 5738.484375, + "completions/mean_terminated_length": 5738.484375, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.8617956340312958, + "epoch": 0.22447102115915363, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002177527640014887, + "learning_rate": 1e-05, + "loss": -0.0189, + "num_tokens": 193678859.0, + "reward": 0.5546875, + "reward_std": 0.33220988512039185, + "rewards/accuracy_reward/mean": 0.5546875, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570846557617, + "sampling/importance_sampling_ratio/min": 0.0008533780346624553, + "sampling/sampling_logp_difference/max": 7.06630802154541, + "sampling/sampling_logp_difference/mean": 0.018141131848096848, + "step": 244 + }, + { + "clip_ratio/high_max": 3.861003733618418e-06, + "clip_ratio/high_mean": 9.652509334046044e-07, + "clip_ratio/low_mean": 2.7767115511778684e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8732366558870126e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15595.0, + "completions/mean_length": 6382.90625, + "completions/mean_terminated_length": 5976.357421875, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.8692388981580734, + "epoch": 0.22539098436062557, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.004127771593630314, + "learning_rate": 1e-05, + "loss": 0.0572, + "num_tokens": 194511847.0, + "reward": 0.4140625, + "reward_std": 0.2767002582550049, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 5.4239239943854045e-06, + "sampling/sampling_logp_difference/max": 12.124691009521484, + "sampling/sampling_logp_difference/mean": 0.018376430496573448, + "step": 245 + }, + { + "clip_ratio/high_max": 9.728395525598899e-06, + "clip_ratio/high_mean": 2.4320988813997246e-06, + "clip_ratio/low_mean": 5.3631663831765763e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.606376271316549e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14504.0, + "completions/max_terminated_length": 14504.0, + "completions/mean_length": 5776.15625, + "completions/mean_terminated_length": 5776.15625, + "completions/min_length": 1018.0, + "completions/min_terminated_length": 1018.0, + "entropy": 1.1195004731416702, + "epoch": 0.22631094756209752, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00263008801266551, + "learning_rate": 1e-05, + "loss": 0.0687, + "num_tokens": 195270051.0, + "reward": 0.421875, + "reward_std": 0.3618982434272766, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999971866607666, + "sampling/importance_sampling_ratio/min": 0.005209421273320913, + "sampling/sampling_logp_difference/max": 5.257286548614502, + "sampling/sampling_logp_difference/mean": 0.019923292100429535, + "step": 246 + }, + { + "clip_ratio/high_max": 1.2701100786216557e-05, + "clip_ratio/high_mean": 3.1752751965541393e-06, + "clip_ratio/low_mean": 4.2162768181697174e-05, + 
"clip_ratio/low_min": 3.873926743835909e-06, + "clip_ratio/region_mean": 4.5338043378251314e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15203.0, + "completions/mean_length": 7411.421875, + "completions/mean_terminated_length": 7196.08056640625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.9801053553819656, + "epoch": 0.22723091076356947, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002642859937623143, + "learning_rate": 1e-05, + "loss": 0.07, + "num_tokens": 196240913.0, + "reward": 0.390625, + "reward_std": 0.27328529953956604, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999198913574219, + "sampling/importance_sampling_ratio/min": 0.00017500204558018595, + "sampling/sampling_logp_difference/max": 8.650712966918945, + "sampling/sampling_logp_difference/mean": 0.021511007100343704, + "step": 247 + }, + { + "clip_ratio/high_max": 1.5122936929401476e-05, + "clip_ratio/high_mean": 3.780734232350369e-06, + "clip_ratio/low_mean": 6.367217611114029e-05, + "clip_ratio/low_min": 4.8010447244450916e-06, + "clip_ratio/region_mean": 6.745291057086433e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16127.0, + "completions/mean_length": 7944.65625, + "completions/mean_terminated_length": 7742.1123046875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 1.0132562816143036, + "epoch": 0.2281508739650414, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002439325675368309, + "learning_rate": 1e-05, + "loss": 0.0564, + "num_tokens": 197278517.0, + "reward": 0.34375, + "reward_std": 0.3161812424659729, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 
0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999248385429382, + "sampling/importance_sampling_ratio/min": 1.0140610356756952e-05, + "sampling/sampling_logp_difference/max": 11.49896240234375, + "sampling/sampling_logp_difference/mean": 0.02124868705868721, + "step": 248 + }, + { + "clip_ratio/high_max": 2.6017536356448545e-05, + "clip_ratio/high_mean": 6.504384089112136e-06, + "clip_ratio/low_mean": 3.7791321346958284e-05, + "clip_ratio/low_min": 3.2110563097376144e-06, + "clip_ratio/region_mean": 4.429570503816649e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 7550.0, + "completions/mean_terminated_length": 7409.7783203125, + "completions/min_length": 1469.0, + "completions/min_terminated_length": 1469.0, + "entropy": 1.0384011715650558, + "epoch": 0.22907083716651333, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014879995724186301, + "learning_rate": 1e-05, + "loss": 0.0338, + "num_tokens": 198265589.0, + "reward": 0.3359375, + "reward_std": 0.24040167033672333, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999468922615051, + "sampling/importance_sampling_ratio/min": 8.418659126618877e-05, + "sampling/sampling_logp_difference/max": 9.382474899291992, + "sampling/sampling_logp_difference/mean": 0.021503347903490067, + "step": 249 + }, + { + "clip_ratio/high_max": 1.3615457191917812e-05, + "clip_ratio/high_mean": 4.491880531531933e-06, + "clip_ratio/low_mean": 3.916533574965797e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.365721684962409e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16221.0, + "completions/mean_length": 8140.9140625, + 
"completions/mean_terminated_length": 7517.48779296875, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.8718572407960892, + "epoch": 0.22999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002340668346732855, + "learning_rate": 1e-05, + "loss": 0.0585, + "num_tokens": 199324938.0, + "reward": 0.453125, + "reward_std": 0.35824596881866455, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999454021453857, + "sampling/importance_sampling_ratio/min": 0.002325017238035798, + "sampling/sampling_logp_difference/max": 6.064027786254883, + "sampling/sampling_logp_difference/mean": 0.019466478377580643, + "step": 250 + }, + { + "clip_ratio/high_max": 2.2175697040438536e-05, + "clip_ratio/high_mean": 5.543924260109634e-06, + "clip_ratio/low_mean": 4.1318608055007644e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.686253225827386e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16263.0, + "completions/mean_length": 6630.96875, + "completions/mean_terminated_length": 6396.896484375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.7798146530985832, + "epoch": 0.23091076356945722, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001989356242120266, + "learning_rate": 1e-05, + "loss": 0.0218, + "num_tokens": 200189902.0, + "reward": 0.5625, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999474883079529, + "sampling/importance_sampling_ratio/min": 0.0003315774374641478, + "sampling/sampling_logp_difference/max": 8.011649131774902, + 
"sampling/sampling_logp_difference/mean": 0.01849902793765068, + "step": 251 + }, + { + "clip_ratio/high_max": 3.325706302348408e-06, + "clip_ratio/high_mean": 8.31426575587102e-07, + "clip_ratio/low_mean": 2.0285911205064622e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.111733795118198e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15357.0, + "completions/max_terminated_length": 15357.0, + "completions/mean_length": 6582.203125, + "completions/mean_terminated_length": 6582.203125, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 1.0181676000356674, + "epoch": 0.23183072677092917, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002594445599243045, + "learning_rate": 1e-05, + "loss": 0.0232, + "num_tokens": 201052832.0, + "reward": 0.34375, + "reward_std": 0.25460314750671387, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999495148658752, + "sampling/importance_sampling_ratio/min": 0.0003853558446280658, + "sampling/sampling_logp_difference/max": 7.8613433837890625, + "sampling/sampling_logp_difference/mean": 0.021598614752292633, + "step": 252 + }, + { + "clip_ratio/high_max": 2.2044430352252675e-05, + "clip_ratio/high_mean": 5.511107588063169e-06, + "clip_ratio/low_mean": 3.4155824209847196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.96669319115972e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14540.0, + "completions/max_terminated_length": 14540.0, + "completions/mean_length": 6145.1796875, + "completions/mean_terminated_length": 6145.1796875, + "completions/min_length": 1098.0, + "completions/min_terminated_length": 1098.0, + "entropy": 0.9084350541234016, + "epoch": 0.23275068997240111, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003104996867477894, + "learning_rate": 1e-05, + 
"loss": 0.0811, + "num_tokens": 201858047.0, + "reward": 0.5078125, + "reward_std": 0.33220985531806946, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000011682510376, + "sampling/importance_sampling_ratio/min": 0.007650630082935095, + "sampling/sampling_logp_difference/max": 4.87296724319458, + "sampling/sampling_logp_difference/mean": 0.018979094922542572, + "step": 253 + }, + { + "clip_ratio/high_max": 2.9959978519400465e-05, + "clip_ratio/high_mean": 7.489994629850116e-06, + "clip_ratio/low_mean": 3.5255963325653283e-05, + "clip_ratio/low_min": 2.973075879708631e-06, + "clip_ratio/region_mean": 4.274595892184152e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15745.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 7259.953125, + "completions/mean_terminated_length": 7259.953125, + "completions/min_length": 960.0, + "completions/min_terminated_length": 960.0, + "entropy": 0.9823614731431007, + "epoch": 0.23367065317387303, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003212577663362026, + "learning_rate": 1e-05, + "loss": 0.0133, + "num_tokens": 202807673.0, + "reward": 0.4765625, + "reward_std": 0.3056321144104004, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999860405921936, + "sampling/importance_sampling_ratio/min": 0.000536504783667624, + "sampling/sampling_logp_difference/max": 7.530435085296631, + "sampling/sampling_logp_difference/mean": 0.021432969719171524, + "step": 254 + }, + { + "clip_ratio/high_max": 3.273996276220714e-05, + "clip_ratio/high_mean": 9.095591565255745e-06, + "clip_ratio/low_mean": 2.9539680099333054e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8635271948805894e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16369.0, + "completions/mean_length": 7258.71875, + "completions/mean_terminated_length": 7113.87353515625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.8823810070753098, + "epoch": 0.23459061637534498, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001418307889252901, + "learning_rate": 1e-05, + "loss": 0.0411, + "num_tokens": 203757333.0, + "reward": 0.40625, + "reward_std": 0.3048579692840576, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884963035583, + "sampling/importance_sampling_ratio/min": 0.0006408974295482039, + "sampling/sampling_logp_difference/max": 7.3526411056518555, + "sampling/sampling_logp_difference/mean": 0.019296500831842422, + "step": 255 + }, + { + "clip_ratio/high_max": 1.544119368190877e-05, + "clip_ratio/high_mean": 3.860298420477193e-06, + "clip_ratio/low_mean": 3.755458698151415e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.141488631148604e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 7011.40625, + "completions/mean_terminated_length": 6386.56689453125, + "completions/min_length": 685.0, + "completions/min_terminated_length": 685.0, + "entropy": 0.8057166337966919, + "epoch": 0.23551057957681693, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001652427832596004, + "learning_rate": 1e-05, + "loss": 0.0459, + "num_tokens": 204675065.0, + "reward": 0.46875, + "reward_std": 0.24146251380443573, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999918937683105, + 
"sampling/importance_sampling_ratio/min": 0.015319154597818851, + "sampling/sampling_logp_difference/max": 4.178651332855225, + "sampling/sampling_logp_difference/mean": 0.018787402659654617, + "step": 256 + }, + { + "clip_ratio/high_max": 5.222041181696113e-06, + "clip_ratio/high_mean": 2.209917965956265e-06, + "clip_ratio/low_mean": 4.0701652551433654e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.291157006264257e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14796.0, + "completions/max_terminated_length": 14796.0, + "completions/mean_length": 6243.4296875, + "completions/mean_terminated_length": 6243.4296875, + "completions/min_length": 1023.0, + "completions/min_terminated_length": 1023.0, + "entropy": 0.9856048971414566, + "epoch": 0.23643054277828887, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001482579973526299, + "learning_rate": 1e-05, + "loss": 0.0677, + "num_tokens": 205494344.0, + "reward": 0.5390625, + "reward_std": 0.28930407762527466, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998942613601685, + "sampling/importance_sampling_ratio/min": 0.0004254466330166906, + "sampling/sampling_logp_difference/max": 7.762371063232422, + "sampling/sampling_logp_difference/mean": 0.019727632403373718, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 6.842733455414418e-05, + "clip_ratio/low_min": 9.297655878981459e-06, + "clip_ratio/region_mean": 6.842733455414418e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15485.0, + "completions/mean_length": 7122.2421875, + "completions/mean_terminated_length": 6586.4375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.8625433370471001, + "epoch": 
0.23735050597976082, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002006452763453126, + "learning_rate": 1e-05, + "loss": 0.0312, + "num_tokens": 206428775.0, + "reward": 0.40625, + "reward_std": 0.2987973093986511, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999338388442993, + "sampling/importance_sampling_ratio/min": 0.00010911409481195733, + "sampling/sampling_logp_difference/max": 9.123116493225098, + "sampling/sampling_logp_difference/mean": 0.01927522011101246, + "step": 258 + }, + { + "clip_ratio/high_max": 2.887607206503162e-05, + "clip_ratio/high_mean": 7.219018016257905e-06, + "clip_ratio/low_mean": 2.7790995090981596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.501001378936053e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15254.0, + "completions/mean_length": 7965.2734375, + "completions/mean_terminated_length": 7623.6826171875, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "entropy": 1.0068430602550507, + "epoch": 0.23827046918123276, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0029176415409892797, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 207469586.0, + "reward": 0.3828125, + "reward_std": 0.2212003916501999, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998945593833923, + "sampling/importance_sampling_ratio/min": 4.06005028708023e-06, + "sampling/sampling_logp_difference/max": 12.414315223693848, + "sampling/sampling_logp_difference/mean": 0.02198987640440464, + "step": 259 + }, + { + "clip_ratio/high_max": 8.710998599781306e-06, + "clip_ratio/high_mean": 2.1777496499453264e-06, + 
"clip_ratio/low_mean": 4.1899779091636447e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.407752874158177e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15331.0, + "completions/mean_length": 6329.4296875, + "completions/mean_terminated_length": 6169.83349609375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.9399363100528717, + "epoch": 0.23919043238270468, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0019115234026685357, + "learning_rate": 1e-05, + "loss": 0.0399, + "num_tokens": 208300217.0, + "reward": 0.4375, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000567436218262, + "sampling/importance_sampling_ratio/min": 2.1449603082146496e-05, + "sampling/sampling_logp_difference/max": 10.749804496765137, + "sampling/sampling_logp_difference/mean": 0.020002204924821854, + "step": 260 + }, + { + "clip_ratio/high_max": 2.536784450057894e-05, + "clip_ratio/high_mean": 6.341961125144735e-06, + "clip_ratio/low_mean": 5.959111433639919e-05, + "clip_ratio/low_min": 1.1521060741870315e-05, + "clip_ratio/region_mean": 6.593307591629127e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15635.0, + "completions/mean_length": 6747.90625, + "completions/mean_terminated_length": 6594.95263671875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.9575144425034523, + "epoch": 0.24011039558417663, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003766207257285714, + "learning_rate": 1e-05, + "loss": 0.0667, + "num_tokens": 209181077.0, + "reward": 0.4375, + "reward_std": 0.3164137303829193, + "rewards/accuracy_reward/mean": 0.4375, + 
"rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999313354492188, + "sampling/importance_sampling_ratio/min": 1.250743298442103e-05, + "sampling/sampling_logp_difference/max": 11.28918743133545, + "sampling/sampling_logp_difference/mean": 0.020067427307367325, + "step": 261 + }, + { + "clip_ratio/high_max": 2.0626074274332495e-05, + "clip_ratio/high_mean": 5.156518568583124e-06, + "clip_ratio/low_mean": 5.808068385704246e-05, + "clip_ratio/low_min": 1.0360539818066172e-05, + "clip_ratio/region_mean": 6.32372018571914e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16218.0, + "completions/mean_length": 6426.6953125, + "completions/mean_terminated_length": 6348.29150390625, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.87480478733778, + "epoch": 0.24103035878564857, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.002375675830990076, + "learning_rate": 1e-05, + "loss": 0.0752, + "num_tokens": 210023702.0, + "reward": 0.5078125, + "reward_std": 0.38900789618492126, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999383687973022, + "sampling/importance_sampling_ratio/min": 0.00024259372730739415, + "sampling/sampling_logp_difference/max": 8.324122428894043, + "sampling/sampling_logp_difference/mean": 0.018864646553993225, + "step": 262 + }, + { + "clip_ratio/high_max": 4.462851393327583e-06, + "clip_ratio/high_mean": 1.1157128483318957e-06, + "clip_ratio/low_mean": 3.8966268334661436e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.008198141036701e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16022.0, + 
"completions/mean_length": 7223.1484375, + "completions/mean_terminated_length": 6927.63671875, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "entropy": 1.0218688547611237, + "epoch": 0.24195032198712052, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016257674433290958, + "learning_rate": 1e-05, + "loss": 0.0791, + "num_tokens": 210969921.0, + "reward": 0.4609375, + "reward_std": 0.2896084189414978, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999514818191528, + "sampling/importance_sampling_ratio/min": 9.193710138788447e-05, + "sampling/sampling_logp_difference/max": 9.294405937194824, + "sampling/sampling_logp_difference/mean": 0.02119653858244419, + "step": 263 + }, + { + "clip_ratio/high_max": 1.2653464409595472e-05, + "clip_ratio/high_mean": 3.163366102398868e-06, + "clip_ratio/low_mean": 4.864477250521304e-05, + "clip_ratio/low_min": 8.641252861707471e-06, + "clip_ratio/region_mean": 5.1808138323394815e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15180.0, + "completions/max_terminated_length": 15180.0, + "completions/mean_length": 6974.0703125, + "completions/mean_terminated_length": 6974.0703125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9265539348125458, + "epoch": 0.24287028518859247, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023448490537703037, + "learning_rate": 1e-05, + "loss": 0.0567, + "num_tokens": 211884866.0, + "reward": 0.390625, + "reward_std": 0.2885475754737854, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000474452972412, + "sampling/importance_sampling_ratio/min": 0.0007677432149648666, + 
"sampling/sampling_logp_difference/max": 7.172055244445801, + "sampling/sampling_logp_difference/mean": 0.020384611561894417, + "step": 264 + }, + { + "clip_ratio/high_max": 1.1967917316724197e-05, + "clip_ratio/high_mean": 2.9919793291810493e-06, + "clip_ratio/low_mean": 3.179497366545547e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.478695157355105e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15657.0, + "completions/mean_length": 7247.2734375, + "completions/mean_terminated_length": 7027.9921875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.9756898358464241, + "epoch": 0.24379024839006438, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003212807234376669, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 212833933.0, + "reward": 0.328125, + "reward_std": 0.2398776412010193, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999449253082275, + "sampling/importance_sampling_ratio/min": 0.001600456889718771, + "sampling/sampling_logp_difference/max": 6.437466144561768, + "sampling/sampling_logp_difference/mean": 0.0199666079133749, + "step": 265 + }, + { + "clip_ratio/high_max": 1.1404694760130951e-05, + "clip_ratio/high_mean": 3.887520392709121e-06, + "clip_ratio/low_mean": 4.0242122167910566e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4129643583801226e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15575.0, + "completions/mean_length": 7773.9296875, + "completions/mean_terminated_length": 7423.9267578125, + "completions/min_length": 568.0, + "completions/min_terminated_length": 568.0, + "entropy": 0.9765531942248344, + "epoch": 0.24471021159153633, + 
"frac_reward_zero_std": 0.375, + "grad_norm": 0.0019600428640842438, + "learning_rate": 1e-05, + "loss": 0.0357, + "num_tokens": 213848508.0, + "reward": 0.3984375, + "reward_std": 0.3129909336566925, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 2.430168751743622e-05, + "sampling/sampling_logp_difference/max": 10.624964714050293, + "sampling/sampling_logp_difference/mean": 0.020565161481499672, + "step": 266 + }, + { + "clip_ratio/high_max": 6.725708999510971e-06, + "clip_ratio/high_mean": 1.6814272498777427e-06, + "clip_ratio/low_mean": 2.869901106805628e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0380438261090603e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15583.0, + "completions/mean_length": 6722.5, + "completions/mean_terminated_length": 6569.14306640625, + "completions/min_length": 1021.0, + "completions/min_terminated_length": 1021.0, + "entropy": 0.9291529878973961, + "epoch": 0.24563017479300828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014550165506079793, + "learning_rate": 1e-05, + "loss": 0.0235, + "num_tokens": 214731180.0, + "reward": 0.4921875, + "reward_std": 0.19332444667816162, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999178647994995, + "sampling/importance_sampling_ratio/min": 0.007400285452604294, + "sampling/sampling_logp_difference/max": 4.90623664855957, + "sampling/sampling_logp_difference/mean": 0.020057080313563347, + "step": 267 + }, + { + "clip_ratio/high_max": 1.8797170469042612e-05, + "clip_ratio/high_mean": 6.827749643889547e-06, + "clip_ratio/low_mean": 
3.448591337473772e-05, + "clip_ratio/low_min": 4.687090040533803e-06, + "clip_ratio/region_mean": 4.1313662677566754e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15797.0, + "completions/max_terminated_length": 15797.0, + "completions/mean_length": 7001.8671875, + "completions/mean_terminated_length": 7001.8671875, + "completions/min_length": 930.0, + "completions/min_terminated_length": 930.0, + "entropy": 1.0746883526444435, + "epoch": 0.24655013799448022, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002483292715623975, + "learning_rate": 1e-05, + "loss": 0.048, + "num_tokens": 215645819.0, + "reward": 0.3515625, + "reward_std": 0.32955142855644226, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 1.0195622053288389e-05, + "sampling/sampling_logp_difference/max": 11.493552207946777, + "sampling/sampling_logp_difference/mean": 0.020808640867471695, + "step": 268 + }, + { + "clip_ratio/high_max": 8.748068921704544e-06, + "clip_ratio/high_mean": 2.187017230426136e-06, + "clip_ratio/low_mean": 8.762007928453386e-05, + "clip_ratio/low_min": 2.3698836685071e-05, + "clip_ratio/region_mean": 8.980709480965743e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14189.0, + "completions/mean_length": 6663.796875, + "completions/mean_terminated_length": 6509.50830078125, + "completions/min_length": 1148.0, + "completions/min_terminated_length": 1148.0, + "entropy": 1.0000900849699974, + "epoch": 0.24747010119595217, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0015696679474785924, + "learning_rate": 1e-05, + "loss": 0.0731, + "num_tokens": 216519369.0, + "reward": 0.3671875, + "reward_std": 0.3214311897754669, + "rewards/accuracy_reward/mean": 0.3671875, + 
"rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997707605361938, + "sampling/importance_sampling_ratio/min": 1.288027192458685e-06, + "sampling/sampling_logp_difference/max": 13.562398910522461, + "sampling/sampling_logp_difference/mean": 0.022182684391736984, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.567897690321843e-05, + "clip_ratio/low_min": 3.287224444648018e-06, + "clip_ratio/region_mean": 4.567897690321843e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16381.0, + "completions/mean_length": 6978.7421875, + "completions/mean_terminated_length": 6829.45263671875, + "completions/min_length": 1661.0, + "completions/min_terminated_length": 1661.0, + "entropy": 1.0845019966363907, + "epoch": 0.24839006439742412, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003887100610882044, + "learning_rate": 1e-05, + "loss": 0.1076, + "num_tokens": 217432432.0, + "reward": 0.3671875, + "reward_std": 0.3124619722366333, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999902248382568, + "sampling/importance_sampling_ratio/min": 0.02168075367808342, + "sampling/sampling_logp_difference/max": 3.8313302993774414, + "sampling/sampling_logp_difference/mean": 0.02127157337963581, + "step": 270 + }, + { + "clip_ratio/high_max": 2.444328310957644e-05, + "clip_ratio/high_mean": 6.11082077739411e-06, + "clip_ratio/low_mean": 5.1527222922231886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.7638043699625996e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15638.0, + "completions/mean_length": 5903.5546875, + 
"completions/mean_terminated_length": 5652.0244140625, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "entropy": 0.8638224303722382, + "epoch": 0.24931002759889603, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002851828932762146, + "learning_rate": 1e-05, + "loss": 0.0771, + "num_tokens": 218208399.0, + "reward": 0.4453125, + "reward_std": 0.3713914752006531, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000553131103516, + "sampling/importance_sampling_ratio/min": 0.000626727007329464, + "sampling/sampling_logp_difference/max": 7.374999523162842, + "sampling/sampling_logp_difference/mean": 0.01880766451358795, + "step": 271 + }, + { + "clip_ratio/high_max": 8.474872856822913e-06, + "clip_ratio/high_mean": 2.118718214205728e-06, + "clip_ratio/low_mean": 2.5821682072546537e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.794040096887329e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16060.0, + "completions/max_terminated_length": 16060.0, + "completions/mean_length": 5596.7109375, + "completions/mean_terminated_length": 5596.7109375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 1.1127397641539574, + "epoch": 0.250229990800368, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018005800666287541, + "learning_rate": 1e-05, + "loss": 0.0075, + "num_tokens": 218944418.0, + "reward": 0.4375, + "reward_std": 0.29485049843788147, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000048875808716, + "sampling/importance_sampling_ratio/min": 0.01548748929053545, + "sampling/sampling_logp_difference/max": 4.167722702026367, + "sampling/sampling_logp_difference/mean": 
0.02004322223365307, + "step": 272 + }, + { + "clip_ratio/high_max": 1.5034628631838132e-05, + "clip_ratio/high_mean": 4.925485768580984e-06, + "clip_ratio/low_mean": 3.539464648838475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.032013237065257e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16362.0, + "completions/mean_length": 7120.109375, + "completions/mean_terminated_length": 7047.16552734375, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "entropy": 1.0697019025683403, + "epoch": 0.2511499540018399, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022711476776748896, + "learning_rate": 1e-05, + "loss": 0.0126, + "num_tokens": 219875952.0, + "reward": 0.2734375, + "reward_std": 0.23751862347126007, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000036358833313, + "sampling/importance_sampling_ratio/min": 9.733050683280453e-05, + "sampling/sampling_logp_difference/max": 9.237398147583008, + "sampling/sampling_logp_difference/mean": 0.02110595628619194, + "step": 273 + }, + { + "clip_ratio/high_max": 1.0558468147792155e-05, + "clip_ratio/high_mean": 2.6396170369480387e-06, + "clip_ratio/low_mean": 3.796903268948881e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.060864915800266e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15745.0, + "completions/mean_length": 7623.953125, + "completions/mean_terminated_length": 7484.9052734375, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "entropy": 0.8836525157094002, + "epoch": 0.25206991720331184, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002609838731586933, + "learning_rate": 1e-05, + "loss": 0.0563, + 
"num_tokens": 220871730.0, + "reward": 0.3046875, + "reward_std": 0.30061954259872437, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999877214431763, + "sampling/importance_sampling_ratio/min": 0.0015448236372321844, + "sampling/sampling_logp_difference/max": 6.472845554351807, + "sampling/sampling_logp_difference/mean": 0.019322458654642105, + "step": 274 + }, + { + "clip_ratio/high_max": 1.144785210271948e-05, + "clip_ratio/high_mean": 2.86196302567987e-06, + "clip_ratio/low_mean": 5.795533934360719e-05, + "clip_ratio/low_min": 4.49300887339632e-06, + "clip_ratio/region_mean": 6.081730361984228e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15557.0, + "completions/mean_length": 6778.71875, + "completions/mean_terminated_length": 6703.08642578125, + "completions/min_length": 1187.0, + "completions/min_terminated_length": 1187.0, + "entropy": 0.8968989998102188, + "epoch": 0.2529898804047838, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.00395589042454958, + "learning_rate": 1e-05, + "loss": 0.0538, + "num_tokens": 221761214.0, + "reward": 0.4921875, + "reward_std": 0.4032142758369446, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000214576721191, + "sampling/importance_sampling_ratio/min": 0.0011724763317033648, + "sampling/sampling_logp_difference/max": 6.7486371994018555, + "sampling/sampling_logp_difference/mean": 0.018937086686491966, + "step": 275 + }, + { + "clip_ratio/high_max": 2.708495139813749e-05, + "clip_ratio/high_mean": 7.628764933542698e-06, + "clip_ratio/low_mean": 3.0297362627607072e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.792612744746293e-05, + 
"completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 7319.2578125, + "completions/mean_terminated_length": 6794.85107421875, + "completions/min_length": 1034.0, + "completions/min_terminated_length": 1034.0, + "entropy": 0.870811752974987, + "epoch": 0.25390984360625574, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002063714899122715, + "learning_rate": 1e-05, + "loss": 0.0271, + "num_tokens": 222719287.0, + "reward": 0.3203125, + "reward_std": 0.2835301160812378, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999525547027588, + "sampling/importance_sampling_ratio/min": 2.13631665246794e-05, + "sampling/sampling_logp_difference/max": 10.7538423538208, + "sampling/sampling_logp_difference/mean": 0.019336167722940445, + "step": 276 + }, + { + "clip_ratio/high_max": 3.860288416035473e-06, + "clip_ratio/high_mean": 9.650721040088683e-07, + "clip_ratio/low_mean": 2.303871349340625e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4003785597415117e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16326.0, + "completions/mean_length": 6207.4140625, + "completions/mean_terminated_length": 5879.13671875, + "completions/min_length": 752.0, + "completions/min_terminated_length": 752.0, + "entropy": 0.8348869979381561, + "epoch": 0.2548298068077277, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0023463829420506954, + "learning_rate": 1e-05, + "loss": 0.0696, + "num_tokens": 223533372.0, + "reward": 0.4375, + "reward_std": 0.2359210103750229, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
1.0000433921813965, + "sampling/importance_sampling_ratio/min": 2.1447433027788065e-05, + "sampling/sampling_logp_difference/max": 10.749905586242676, + "sampling/sampling_logp_difference/mean": 0.018392907455563545, + "step": 277 + }, + { + "clip_ratio/high_max": 2.1441665467136772e-05, + "clip_ratio/high_mean": 5.360416366784193e-06, + "clip_ratio/low_mean": 5.504566888703266e-05, + "clip_ratio/low_min": 1.2581466762640048e-05, + "clip_ratio/region_mean": 6.040608514013002e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14713.0, + "completions/max_terminated_length": 14713.0, + "completions/mean_length": 6417.2109375, + "completions/mean_terminated_length": 6417.2109375, + "completions/min_length": 981.0, + "completions/min_terminated_length": 981.0, + "entropy": 1.0232173576951027, + "epoch": 0.25574977000919963, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0033652919810265303, + "learning_rate": 1e-05, + "loss": 0.034, + "num_tokens": 224375711.0, + "reward": 0.390625, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999820590019226, + "sampling/importance_sampling_ratio/min": 0.0020559614058583975, + "sampling/sampling_logp_difference/max": 6.18701171875, + "sampling/sampling_logp_difference/mean": 0.020980924367904663, + "step": 278 + }, + { + "clip_ratio/high_max": 4.679544872487895e-06, + "clip_ratio/high_mean": 1.1698862181219738e-06, + "clip_ratio/low_mean": 2.818696702888701e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9356853247008985e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15026.0, + "completions/max_terminated_length": 15026.0, + "completions/mean_length": 5275.9453125, + "completions/mean_terminated_length": 5275.9453125, + "completions/min_length": 473.0, + "completions/min_terminated_length": 
473.0, + "entropy": 0.8563915193080902, + "epoch": 0.25666973321067155, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0025473968125879765, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 225070992.0, + "reward": 0.703125, + "reward_std": 0.2790592610836029, + "rewards/accuracy_reward/mean": 0.703125, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999873042106628, + "sampling/importance_sampling_ratio/min": 0.0010016229934990406, + "sampling/sampling_logp_difference/max": 6.906133651733398, + "sampling/sampling_logp_difference/mean": 0.018068701028823853, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.1973035422415705e-05, + "clip_ratio/low_min": 6.267234766710317e-06, + "clip_ratio/region_mean": 4.1973035422415705e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16350.0, + "completions/mean_length": 7693.984375, + "completions/mean_terminated_length": 7556.0478515625, + "completions/min_length": 1349.0, + "completions/min_terminated_length": 1349.0, + "entropy": 0.7832933664321899, + "epoch": 0.2575896964121435, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0016663498245179653, + "learning_rate": 1e-05, + "loss": 0.0836, + "num_tokens": 226073822.0, + "reward": 0.421875, + "reward_std": 0.3227166533470154, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999273419380188, + "sampling/importance_sampling_ratio/min": 5.893720299354754e-06, + "sampling/sampling_logp_difference/max": 12.04162311553955, + "sampling/sampling_logp_difference/mean": 0.01851016655564308, + "step": 280 + }, + { + "clip_ratio/high_max": 1.304801662627142e-05, + "clip_ratio/high_mean": 
3.262004156567855e-06, + "clip_ratio/low_mean": 3.7096169648975774e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.035817426029098e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15207.0, + "completions/mean_length": 6222.4609375, + "completions/mean_terminated_length": 6061.1669921875, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "entropy": 0.8835120126605034, + "epoch": 0.25850965961361544, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0021045261528342962, + "learning_rate": 1e-05, + "loss": 0.055, + "num_tokens": 226888577.0, + "reward": 0.5078125, + "reward_std": 0.2767002284526825, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999616742134094, + "sampling/importance_sampling_ratio/min": 5.688065698450373e-07, + "sampling/sampling_logp_difference/max": 14.379725456237793, + "sampling/sampling_logp_difference/mean": 0.018851105123758316, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.1754828114571865e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1754828114571865e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16331.0, + "completions/mean_length": 6371.625, + "completions/mean_terminated_length": 6131.328125, + "completions/min_length": 1034.0, + "completions/min_terminated_length": 1034.0, + "entropy": 0.9026313945651054, + "epoch": 0.2594296228150874, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030448357574641705, + "learning_rate": 1e-05, + "loss": 0.1009, + "num_tokens": 227722025.0, + "reward": 0.515625, + "reward_std": 0.2722293734550476, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 
0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999712705612183, + "sampling/importance_sampling_ratio/min": 0.00016869648243300617, + "sampling/sampling_logp_difference/max": 8.687409400939941, + "sampling/sampling_logp_difference/mean": 0.018757576122879982, + "step": 282 + }, + { + "clip_ratio/high_max": 7.024085562079563e-06, + "clip_ratio/high_mean": 1.7560213905198907e-06, + "clip_ratio/low_mean": 3.379111592494155e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5547137599678535e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15891.0, + "completions/mean_length": 7510.4921875, + "completions/mean_terminated_length": 7224.25, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 1.044313833117485, + "epoch": 0.26034958601655933, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019295766251161695, + "learning_rate": 1e-05, + "loss": 0.0513, + "num_tokens": 228703256.0, + "reward": 0.3046875, + "reward_std": 0.19674429297447205, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999972581863403, + "sampling/importance_sampling_ratio/min": 0.0002186153142247349, + "sampling/sampling_logp_difference/max": 8.428196907043457, + "sampling/sampling_logp_difference/mean": 0.02207346074283123, + "step": 283 + }, + { + "clip_ratio/high_max": 5.068321115686558e-06, + "clip_ratio/high_mean": 1.2670802789216395e-06, + "clip_ratio/low_mean": 3.7797102550030104e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9064182828951743e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 7594.140625, + 
"completions/mean_terminated_length": 7524.92919921875, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.9706612005829811, + "epoch": 0.2612695492180313, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0017117204843088984, + "learning_rate": 1e-05, + "loss": 0.0748, + "num_tokens": 229697002.0, + "reward": 0.2734375, + "reward_std": 0.18649455904960632, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016212463379, + "sampling/importance_sampling_ratio/min": 0.00035400164779275656, + "sampling/sampling_logp_difference/max": 7.946208953857422, + "sampling/sampling_logp_difference/mean": 0.021097885444760323, + "step": 284 + }, + { + "clip_ratio/high_max": 1.5618601537426002e-05, + "clip_ratio/high_mean": 3.904650384356501e-06, + "clip_ratio/low_mean": 4.570582996166195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.961048034601845e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15562.0, + "completions/mean_length": 6888.9140625, + "completions/mean_terminated_length": 6738.19873046875, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "entropy": 0.9210037142038345, + "epoch": 0.2621895124195032, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025933689903467894, + "learning_rate": 1e-05, + "loss": 0.0887, + "num_tokens": 230598679.0, + "reward": 0.4375, + "reward_std": 0.2580229938030243, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999586939811707, + "sampling/importance_sampling_ratio/min": 0.0007308972999453545, + "sampling/sampling_logp_difference/max": 7.221237659454346, + 
"sampling/sampling_logp_difference/mean": 0.01939917542040348, + "step": 285 + }, + { + "clip_ratio/high_max": 2.398964193162101e-05, + "clip_ratio/high_mean": 6.9283565835576155e-06, + "clip_ratio/low_mean": 4.821338916372042e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.514174608833855e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15458.0, + "completions/mean_length": 6433.640625, + "completions/mean_terminated_length": 6355.29150390625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 1.064419962465763, + "epoch": 0.26310947562097514, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0019397985888645053, + "learning_rate": 1e-05, + "loss": 0.0841, + "num_tokens": 231440153.0, + "reward": 0.375, + "reward_std": 0.3451131582260132, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999503493309021, + "sampling/importance_sampling_ratio/min": 0.019039930775761604, + "sampling/sampling_logp_difference/max": 3.961216926574707, + "sampling/sampling_logp_difference/mean": 0.021084938198328018, + "step": 286 + }, + { + "clip_ratio/high_max": 1.9223051822336856e-05, + "clip_ratio/high_mean": 6.997284344834043e-06, + "clip_ratio/low_mean": 5.4512621773028513e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.150990611786256e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14304.0, + "completions/mean_length": 5858.40625, + "completions/mean_terminated_length": 5691.33349609375, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "entropy": 0.8120778575539589, + "epoch": 0.2640294388224471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002288782736286521, + "learning_rate": 
1e-05, + "loss": 0.0408, + "num_tokens": 232209485.0, + "reward": 0.46875, + "reward_std": 0.36637401580810547, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999239444732666, + "sampling/importance_sampling_ratio/min": 0.00017959839897230268, + "sampling/sampling_logp_difference/max": 8.624787330627441, + "sampling/sampling_logp_difference/mean": 0.019076552242040634, + "step": 287 + }, + { + "clip_ratio/high_max": 9.900939403451048e-06, + "clip_ratio/high_mean": 3.4680233511608094e-06, + "clip_ratio/low_mean": 1.8137742017643177e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1605765368803986e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15825.0, + "completions/mean_length": 7088.4765625, + "completions/mean_terminated_length": 6710.609375, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "entropy": 0.9231890514492989, + "epoch": 0.26494940202391903, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.001075367210432887, + "learning_rate": 1e-05, + "loss": 0.0364, + "num_tokens": 233133850.0, + "reward": 0.5078125, + "reward_std": 0.18383610248565674, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998681545257568, + "sampling/importance_sampling_ratio/min": 0.005257915705442429, + "sampling/sampling_logp_difference/max": 5.248020648956299, + "sampling/sampling_logp_difference/mean": 0.019140273332595825, + "step": 288 + }, + { + "clip_ratio/high_max": 8.648456969240215e-06, + "clip_ratio/high_mean": 2.1621142423100537e-06, + "clip_ratio/low_mean": 1.838804723774956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0550161480059614e-05, + 
"completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16384.0, + "completions/mean_length": 6151.78125, + "completions/mean_terminated_length": 5906.20849609375, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.8585417941212654, + "epoch": 0.265869365225391, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0015517349820584059, + "learning_rate": 1e-05, + "loss": 0.0828, + "num_tokens": 233940718.0, + "reward": 0.46875, + "reward_std": 0.21884137392044067, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000255107879639, + "sampling/importance_sampling_ratio/min": 7.617311348440126e-05, + "sampling/sampling_logp_difference/max": 9.482501983642578, + "sampling/sampling_logp_difference/mean": 0.019276250153779984, + "step": 289 + }, + { + "clip_ratio/high_max": 1.1416668485253467e-05, + "clip_ratio/high_mean": 3.7661499732166703e-06, + "clip_ratio/low_mean": 2.1342358195397537e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5108507770710276e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15133.0, + "completions/mean_length": 7111.2578125, + "completions/mean_terminated_length": 6812.13671875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.9735362678766251, + "epoch": 0.2667893284268629, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0036829947493970394, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 234872111.0, + "reward": 0.4296875, + "reward_std": 0.31930169463157654, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 
0.9999943971633911, + "sampling/importance_sampling_ratio/min": 0.0006535807042382658, + "sampling/sampling_logp_difference/max": 7.333044528961182, + "sampling/sampling_logp_difference/mean": 0.021356046199798584, + "step": 290 + }, + { + "clip_ratio/high_max": 2.2526005068357335e-05, + "clip_ratio/high_mean": 5.631501267089334e-06, + "clip_ratio/low_mean": 3.30086276107977e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.864012808207917e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15995.0, + "completions/mean_length": 6787.671875, + "completions/mean_terminated_length": 6478.11279296875, + "completions/min_length": 1404.0, + "completions/min_terminated_length": 1404.0, + "entropy": 0.8856986835598946, + "epoch": 0.26770929162833484, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00234629912301898, + "learning_rate": 1e-05, + "loss": 0.0169, + "num_tokens": 235759149.0, + "reward": 0.5390625, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999296069145203, + "sampling/importance_sampling_ratio/min": 0.00035710117663256824, + "sampling/sampling_logp_difference/max": 7.937491416931152, + "sampling/sampling_logp_difference/mean": 0.01950475014746189, + "step": 291 + }, + { + "clip_ratio/high_max": 2.6025282068076194e-05, + "clip_ratio/high_mean": 6.5063205170190486e-06, + "clip_ratio/low_mean": 4.603358706845029e-05, + "clip_ratio/low_min": 4.53654638477019e-06, + "clip_ratio/region_mean": 5.253990843812062e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15845.0, + "completions/mean_length": 6757.203125, + "completions/mean_terminated_length": 6604.39697265625, + "completions/min_length": 236.0, + 
"completions/min_terminated_length": 236.0, + "entropy": 0.9217840805649757, + "epoch": 0.2686292548298068, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0034125701058655977, + "learning_rate": 1e-05, + "loss": 0.0527, + "num_tokens": 236643319.0, + "reward": 0.3515625, + "reward_std": 0.2896084189414978, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999240636825562, + "sampling/importance_sampling_ratio/min": 6.144329745438881e-06, + "sampling/sampling_logp_difference/max": 11.999980926513672, + "sampling/sampling_logp_difference/mean": 0.020774487406015396, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.5210429246035346e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5210429246035346e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16319.0, + "completions/mean_length": 6504.4375, + "completions/mean_terminated_length": 6185.74169921875, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "entropy": 1.126970261335373, + "epoch": 0.26954921803127874, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0020905097480863333, + "learning_rate": 1e-05, + "loss": 0.0464, + "num_tokens": 237495351.0, + "reward": 0.25, + "reward_std": 0.30904704332351685, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000053644180298, + "sampling/importance_sampling_ratio/min": 0.0009940610034391284, + "sampling/sampling_logp_difference/max": 6.913712024688721, + "sampling/sampling_logp_difference/mean": 0.023218728601932526, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 1.5693222053414502e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5693222053414502e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15888.0, + "completions/mean_length": 5702.4140625, + "completions/mean_terminated_length": 5446.05615234375, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.8803137242794037, + "epoch": 0.2704691812327507, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.002073790645226836, + "learning_rate": 1e-05, + "loss": 0.0066, + "num_tokens": 238251852.0, + "reward": 0.5625, + "reward_std": 0.2022808939218521, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000054955482483, + "sampling/importance_sampling_ratio/min": 0.016290459781885147, + "sampling/sampling_logp_difference/max": 4.117175579071045, + "sampling/sampling_logp_difference/mean": 0.0185186006128788, + "step": 294 + }, + { + "clip_ratio/high_max": 1.4213665508577833e-05, + "clip_ratio/high_mean": 4.4483959982244414e-06, + "clip_ratio/low_mean": 2.979715202400257e-05, + "clip_ratio/low_min": 4.1597336348786484e-06, + "clip_ratio/region_mean": 3.424554824960069e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15915.0, + "completions/mean_length": 7176.2890625, + "completions/mean_terminated_length": 6801.99169921875, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.9554997384548187, + "epoch": 0.27138914443422263, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002781527815386653, + "learning_rate": 1e-05, + "loss": 0.0908, + "num_tokens": 239189385.0, + "reward": 0.5078125, + "reward_std": 0.3634958863258362, + "rewards/accuracy_reward/mean": 0.5078125, + 
"rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999794960021973, + "sampling/importance_sampling_ratio/min": 0.0018711343873292208, + "sampling/sampling_logp_difference/max": 6.281210422515869, + "sampling/sampling_logp_difference/mean": 0.020436719059944153, + "step": 295 + }, + { + "clip_ratio/high_max": 1.2612186310434481e-05, + "clip_ratio/high_mean": 5.171368570699997e-06, + "clip_ratio/low_mean": 4.8968343890010146e-05, + "clip_ratio/low_min": 4.0222671486844774e-06, + "clip_ratio/region_mean": 5.413971166490228e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16020.0, + "completions/mean_length": 7855.578125, + "completions/mean_terminated_length": 7651.2001953125, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "entropy": 0.9450526610016823, + "epoch": 0.27230910763569455, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003132987068966031, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 240217715.0, + "reward": 0.40625, + "reward_std": 0.28512775897979736, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999253153800964, + "sampling/importance_sampling_ratio/min": 0.0011438478250056505, + "sampling/sampling_logp_difference/max": 6.773357391357422, + "sampling/sampling_logp_difference/mean": 0.021461743861436844, + "step": 296 + }, + { + "clip_ratio/high_max": 2.172341964978841e-05, + "clip_ratio/high_mean": 6.823271291978017e-06, + "clip_ratio/low_mean": 3.516899266742257e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.199226441414794e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14060.0, + 
"completions/mean_length": 6240.265625, + "completions/mean_terminated_length": 5913.04833984375, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.8811023011803627, + "epoch": 0.2732290708371665, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0028523094952106476, + "learning_rate": 1e-05, + "loss": 0.015, + "num_tokens": 241035133.0, + "reward": 0.484375, + "reward_std": 0.26143303513526917, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000007152557373, + "sampling/importance_sampling_ratio/min": 0.0006931954412721097, + "sampling/sampling_logp_difference/max": 7.274198532104492, + "sampling/sampling_logp_difference/mean": 0.019493088126182556, + "step": 297 + }, + { + "clip_ratio/high_max": 1.2606601558218244e-05, + "clip_ratio/high_mean": 3.151650389554561e-06, + "clip_ratio/low_mean": 3.768150395444536e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.08331545713736e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15795.0, + "completions/mean_length": 6103.203125, + "completions/mean_terminated_length": 6022.251953125, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.8766692876815796, + "epoch": 0.27414903403863844, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0026241440791636705, + "learning_rate": 1e-05, + "loss": 0.0089, + "num_tokens": 241836479.0, + "reward": 0.453125, + "reward_std": 0.32589423656463623, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925434589386, + "sampling/importance_sampling_ratio/min": 0.00012664205860346556, + "sampling/sampling_logp_difference/max": 
8.974145889282227, + "sampling/sampling_logp_difference/mean": 0.01907728984951973, + "step": 298 + }, + { + "clip_ratio/high_max": 1.7400974911652156e-05, + "clip_ratio/high_mean": 4.350243727913039e-06, + "clip_ratio/low_mean": 4.527119426711579e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.962143839293276e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16169.0, + "completions/mean_length": 7711.0703125, + "completions/mean_terminated_length": 7573.4052734375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 1.0770929008722305, + "epoch": 0.2750689972401104, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003654222236946225, + "learning_rate": 1e-05, + "loss": 0.0443, + "num_tokens": 242844376.0, + "reward": 0.3359375, + "reward_std": 0.2501322627067566, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999839067459106, + "sampling/importance_sampling_ratio/min": 0.0006267472635954618, + "sampling/sampling_logp_difference/max": 7.374967098236084, + "sampling/sampling_logp_difference/mean": 0.022012868896126747, + "step": 299 + }, + { + "clip_ratio/high_max": 1.4325163647299632e-05, + "clip_ratio/high_mean": 3.581290911824908e-06, + "clip_ratio/low_mean": 4.28195745598714e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6400865016948956e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15905.0, + "completions/mean_length": 6616.5546875, + "completions/mean_terminated_length": 6539.6455078125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.8439916148781776, + "epoch": 0.27598896044158233, + "frac_reward_zero_std": 0.25, + "grad_norm": 
0.0029195898678153753, + "learning_rate": 1e-05, + "loss": 0.1094, + "num_tokens": 243708479.0, + "reward": 0.453125, + "reward_std": 0.3516485095024109, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998997449874878, + "sampling/importance_sampling_ratio/min": 2.189194128732197e-05, + "sampling/sampling_logp_difference/max": 10.729392051696777, + "sampling/sampling_logp_difference/mean": 0.017992788925766945, + "step": 300 + }, + { + "clip_ratio/high_max": 1.848296233220026e-05, + "clip_ratio/high_mean": 4.620740583050065e-06, + "clip_ratio/low_mean": 5.01860952226707e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.480683557834709e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15286.0, + "completions/mean_length": 6173.5234375, + "completions/mean_terminated_length": 6093.1259765625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "entropy": 0.8975192531943321, + "epoch": 0.2769089236430543, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0017261393368244171, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 244515378.0, + "reward": 0.53125, + "reward_std": 0.3532412052154541, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999533891677856, + "sampling/importance_sampling_ratio/min": 0.000553854217287153, + "sampling/sampling_logp_difference/max": 7.4986090660095215, + "sampling/sampling_logp_difference/mean": 0.019458644092082977, + "step": 301 + }, + { + "clip_ratio/high_max": 4.114005332667148e-05, + "clip_ratio/high_mean": 1.2276760230633954e-05, + "clip_ratio/low_mean": 3.397437080820964e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 4.625113024303573e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16032.0, + "completions/mean_length": 5640.90625, + "completions/mean_terminated_length": 5470.38134765625, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "entropy": 0.8833519890904427, + "epoch": 0.2778288868445262, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018768958980217576, + "learning_rate": 1e-05, + "loss": 0.0731, + "num_tokens": 245258318.0, + "reward": 0.4609375, + "reward_std": 0.3135277330875397, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999516606330872, + "sampling/importance_sampling_ratio/min": 0.0071789538487792015, + "sampling/sampling_logp_difference/max": 4.936601638793945, + "sampling/sampling_logp_difference/mean": 0.019646335393190384, + "step": 302 + }, + { + "clip_ratio/high_max": 1.4196921938491869e-05, + "clip_ratio/high_mean": 4.514302474944998e-06, + "clip_ratio/low_mean": 4.4677519781544106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.919182129015098e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16378.0, + "completions/mean_length": 7840.5078125, + "completions/mean_terminated_length": 7564.9111328125, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "entropy": 0.9772802665829659, + "epoch": 0.27874885004599814, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002617602702230215, + "learning_rate": 1e-05, + "loss": 0.0298, + "num_tokens": 246280663.0, + "reward": 0.328125, + "reward_std": 0.29826050996780396, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, 
+ "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.0008982301224023104, + "sampling/sampling_logp_difference/max": 7.015084266662598, + "sampling/sampling_logp_difference/mean": 0.022171074524521828, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7621316146687604e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7621316146687604e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16326.0, + "completions/mean_length": 6316.1015625, + "completions/mean_terminated_length": 6074.47216796875, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "entropy": 0.8542795851826668, + "epoch": 0.2796688132474701, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0011874900665134192, + "learning_rate": 1e-05, + "loss": 0.0513, + "num_tokens": 247107604.0, + "reward": 0.3828125, + "reward_std": 0.2227931022644043, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000126361846924, + "sampling/importance_sampling_ratio/min": 0.00015846268797758967, + "sampling/sampling_logp_difference/max": 8.749991416931152, + "sampling/sampling_logp_difference/mean": 0.018691308796405792, + "step": 304 + }, + { + "clip_ratio/high_max": 3.0959752166381804e-06, + "clip_ratio/high_mean": 7.739938041595451e-07, + "clip_ratio/low_mean": 6.0967123090449604e-05, + "clip_ratio/low_min": 2.711407751121442e-05, + "clip_ratio/region_mean": 6.17411176335736e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6568.171875, + "completions/mean_terminated_length": 6412.365234375, + "completions/min_length": 97.0, + 
"completions/min_terminated_length": 97.0, + "entropy": 0.9063890501856804, + "epoch": 0.28058877644894203, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002459619427099824, + "learning_rate": 1e-05, + "loss": 0.0725, + "num_tokens": 247967322.0, + "reward": 0.5, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998743534088135, + "sampling/importance_sampling_ratio/min": 0.012350871227681637, + "sampling/sampling_logp_difference/max": 4.394028663635254, + "sampling/sampling_logp_difference/mean": 0.020134467631578445, + "step": 305 + }, + { + "clip_ratio/high_max": 5.9507838159333915e-06, + "clip_ratio/high_mean": 1.4876959539833479e-06, + "clip_ratio/low_mean": 2.400908408617397e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.549678004015732e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15714.0, + "completions/mean_length": 8182.28125, + "completions/mean_terminated_length": 7635.50048828125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 1.0137704983353615, + "epoch": 0.281508739650414, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016673406353220344, + "learning_rate": 1e-05, + "loss": 0.0244, + "num_tokens": 249031710.0, + "reward": 0.3359375, + "reward_std": 0.22225631773471832, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998995065689087, + "sampling/importance_sampling_ratio/min": 0.0008049134048633277, + "sampling/sampling_logp_difference/max": 7.1247758865356445, + "sampling/sampling_logp_difference/mean": 0.021704845130443573, + "step": 306 + }, + { + "clip_ratio/high_max": 
1.4527202438330278e-05, + "clip_ratio/high_mean": 3.6318006095825695e-06, + "clip_ratio/low_mean": 3.1829216595724574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.5461017205307144e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14502.0, + "completions/max_terminated_length": 14502.0, + "completions/mean_length": 6460.5703125, + "completions/mean_terminated_length": 6460.5703125, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "entropy": 1.0418165400624275, + "epoch": 0.2824287028518859, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022682021372020245, + "learning_rate": 1e-05, + "loss": 0.0171, + "num_tokens": 249881047.0, + "reward": 0.359375, + "reward_std": 0.25566887855529785, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999744296073914, + "sampling/importance_sampling_ratio/min": 0.002809183904901147, + "sampling/sampling_logp_difference/max": 5.874861240386963, + "sampling/sampling_logp_difference/mean": 0.02204791083931923, + "step": 307 + }, + { + "clip_ratio/high_max": 9.222687367582694e-06, + "clip_ratio/high_mean": 4.125313353142701e-06, + "clip_ratio/low_mean": 4.836107154915226e-05, + "clip_ratio/low_min": 3.4611657611094415e-06, + "clip_ratio/region_mean": 5.248638444754761e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14754.0, + "completions/mean_length": 6846.3046875, + "completions/mean_terminated_length": 6694.9130859375, + "completions/min_length": 944.0, + "completions/min_terminated_length": 944.0, + "entropy": 0.9839218333363533, + "epoch": 0.28334866605335784, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.002436346374452114, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 250773806.0, + "reward": 0.484375, + "reward_std": 
0.34299150109291077, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999980628490448, + "sampling/importance_sampling_ratio/min": 0.0257408544421196, + "sampling/sampling_logp_difference/max": 3.6596758365631104, + "sampling/sampling_logp_difference/mean": 0.02135510742664337, + "step": 308 + }, + { + "clip_ratio/high_max": 1.3327621218195418e-05, + "clip_ratio/high_mean": 3.3319053045488545e-06, + "clip_ratio/low_mean": 3.791964286392613e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1251548054788145e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15777.0, + "completions/mean_length": 6558.53125, + "completions/mean_terminated_length": 6241.58056640625, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "entropy": 0.7833076938986778, + "epoch": 0.2842686292548298, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002015948062762618, + "learning_rate": 1e-05, + "loss": 0.0791, + "num_tokens": 251633074.0, + "reward": 0.46875, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999434947967529, + "sampling/importance_sampling_ratio/min": 5.1445105782477185e-05, + "sampling/sampling_logp_difference/max": 9.874995231628418, + "sampling/sampling_logp_difference/mean": 0.017078280448913574, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.3865982686620555e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3865982686620555e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16162.0, + 
"completions/mean_length": 7626.390625, + "completions/mean_terminated_length": 7487.38134765625, + "completions/min_length": 1400.0, + "completions/min_terminated_length": 1400.0, + "entropy": 0.8946382254362106, + "epoch": 0.28518859245630174, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001098336186259985, + "learning_rate": 1e-05, + "loss": 0.042, + "num_tokens": 252629300.0, + "reward": 0.3359375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000107288360596, + "sampling/importance_sampling_ratio/min": 0.00021643216314259917, + "sampling/sampling_logp_difference/max": 8.438233375549316, + "sampling/sampling_logp_difference/mean": 0.01972624473273754, + "step": 310 + }, + { + "clip_ratio/high_max": 6.5777783220255515e-06, + "clip_ratio/high_mean": 1.6444445805063879e-06, + "clip_ratio/low_mean": 1.7658890669736138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9303335250242526e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15763.0, + "completions/mean_length": 5796.984375, + "completions/mean_terminated_length": 5713.6220703125, + "completions/min_length": 528.0, + "completions/min_terminated_length": 528.0, + "entropy": 0.969724528491497, + "epoch": 0.2861085556577737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.003871417138725519, + "learning_rate": 1e-05, + "loss": 0.0408, + "num_tokens": 253389562.0, + "reward": 0.484375, + "reward_std": 0.23752351105213165, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998880624771118, + "sampling/importance_sampling_ratio/min": 2.4301782104885206e-05, + 
"sampling/sampling_logp_difference/max": 10.624960899353027, + "sampling/sampling_logp_difference/mean": 0.019220752641558647, + "step": 311 + }, + { + "clip_ratio/high_max": 8.099077376755304e-06, + "clip_ratio/high_mean": 2.8300572125772305e-06, + "clip_ratio/low_mean": 3.2033483023496956e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.486354006554393e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15769.0, + "completions/mean_length": 6938.5625, + "completions/mean_terminated_length": 6788.63525390625, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.9812447279691696, + "epoch": 0.28702851885924563, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002257548039779067, + "learning_rate": 1e-05, + "loss": -0.0089, + "num_tokens": 254295858.0, + "reward": 0.4140625, + "reward_std": 0.2596206068992615, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000464916229248, + "sampling/importance_sampling_ratio/min": 0.0009388317703269422, + "sampling/sampling_logp_difference/max": 6.970874309539795, + "sampling/sampling_logp_difference/mean": 0.02080199122428894, + "step": 312 + }, + { + "clip_ratio/high_max": 4.441917553776875e-06, + "clip_ratio/high_mean": 1.1104793884442188e-06, + "clip_ratio/low_mean": 3.414505465570983e-05, + "clip_ratio/low_min": 3.790060873143375e-06, + "clip_ratio/region_mean": 3.5255534044154047e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15163.0, + "completions/mean_length": 6878.15625, + "completions/mean_terminated_length": 6650.01611328125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.9106859937310219, + "epoch": 0.28794848206071755, + 
"frac_reward_zero_std": 0.3125, + "grad_norm": 0.00420041661709547, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 255197110.0, + "reward": 0.421875, + "reward_std": 0.30433881282806396, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999925494194031, + "sampling/importance_sampling_ratio/min": 0.015217061154544353, + "sampling/sampling_logp_difference/max": 4.185338020324707, + "sampling/sampling_logp_difference/mean": 0.02016574889421463, + "step": 313 + }, + { + "clip_ratio/high_max": 8.814751254249131e-06, + "clip_ratio/high_mean": 2.203687813562283e-06, + "clip_ratio/low_mean": 3.137724206681014e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.3580929766685585e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14462.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 6260.2578125, + "completions/mean_terminated_length": 6260.2578125, + "completions/min_length": 790.0, + "completions/min_terminated_length": 790.0, + "entropy": 0.9523455575108528, + "epoch": 0.2888684452621895, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027907798066735268, + "learning_rate": 1e-05, + "loss": 0.0302, + "num_tokens": 256018935.0, + "reward": 0.421875, + "reward_std": 0.2659186124801636, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000364780426025, + "sampling/importance_sampling_ratio/min": 7.485197420464829e-05, + "sampling/sampling_logp_difference/max": 9.499998092651367, + "sampling/sampling_logp_difference/mean": 0.0191945917904377, + "step": 314 + }, + { + "clip_ratio/high_max": 2.8685263259831117e-05, + "clip_ratio/high_mean": 7.171315814957779e-06, + "clip_ratio/low_mean": 2.780131131885355e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.497262770224552e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16124.0, + "completions/mean_length": 6202.828125, + "completions/mean_terminated_length": 6041.22265625, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.8513326346874237, + "epoch": 0.28978840846366144, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023744129575788975, + "learning_rate": 1e-05, + "loss": 0.0379, + "num_tokens": 256841129.0, + "reward": 0.5625, + "reward_std": 0.32407689094543457, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000146627426147, + "sampling/importance_sampling_ratio/min": 9.269781003240496e-06, + "sampling/sampling_logp_difference/max": 11.588750839233398, + "sampling/sampling_logp_difference/mean": 0.019519174471497536, + "step": 315 + }, + { + "clip_ratio/high_max": 1.6381697605538648e-05, + "clip_ratio/high_mean": 4.095424401384662e-06, + "clip_ratio/low_mean": 3.0394592840821133e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.449001792432682e-05, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16328.0, + "completions/mean_length": 8019.4609375, + "completions/mean_terminated_length": 7073.90380859375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.9211000874638557, + "epoch": 0.2907083716651334, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024705040268599987, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 257884188.0, + "reward": 0.3046875, + "reward_std": 0.2869499623775482, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999315738677979, + "sampling/importance_sampling_ratio/min": 0.016358470544219017, + "sampling/sampling_logp_difference/max": 4.113009452819824, + "sampling/sampling_logp_difference/mean": 0.01984308287501335, + "step": 316 + }, + { + "clip_ratio/high_max": 7.485402420570608e-06, + "clip_ratio/high_mean": 1.871350605142652e-06, + "clip_ratio/low_mean": 3.025547425750119e-05, + "clip_ratio/low_min": 2.697337095014518e-06, + "clip_ratio/region_mean": 3.212682509001752e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15210.0, + "completions/mean_length": 7257.6875, + "completions/mean_terminated_length": 7038.65625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.8801277950406075, + "epoch": 0.29162833486660533, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032848953269422054, + "learning_rate": 1e-05, + "loss": 0.0305, + "num_tokens": 258831852.0, + "reward": 0.4296875, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998986124992371, + "sampling/importance_sampling_ratio/min": 0.00019848966621793807, + "sampling/sampling_logp_difference/max": 8.524773597717285, + "sampling/sampling_logp_difference/mean": 0.019743187353014946, + "step": 317 + }, + { + "clip_ratio/high_max": 1.52771035573096e-05, + "clip_ratio/high_mean": 3.8192758893274e-06, + "clip_ratio/low_mean": 3.605492440783564e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.987420052453672e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14903.0, + "completions/mean_length": 6042.84375, + 
"completions/mean_terminated_length": 5878.69873046875, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.8792382404208183, + "epoch": 0.29254829806807725, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.004201764706522226, + "learning_rate": 1e-05, + "loss": 0.099, + "num_tokens": 259623512.0, + "reward": 0.640625, + "reward_std": 0.3913668990135193, + "rewards/accuracy_reward/mean": 0.640625, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998612403869629, + "sampling/importance_sampling_ratio/min": 0.00027811730979010463, + "sampling/sampling_logp_difference/max": 8.187467575073242, + "sampling/sampling_logp_difference/mean": 0.018901977688074112, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.1642084397608414e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1642084397608414e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16381.0, + "completions/mean_length": 7667.6875, + "completions/mean_terminated_length": 7458.49658203125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9096411988139153, + "epoch": 0.2934682612695492, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014557713875547051, + "learning_rate": 1e-05, + "loss": 0.0383, + "num_tokens": 260623928.0, + "reward": 0.3515625, + "reward_std": 0.22726887464523315, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999264478683472, + "sampling/importance_sampling_ratio/min": 0.0002615761768538505, + "sampling/sampling_logp_difference/max": 8.248785018920898, + "sampling/sampling_logp_difference/mean": 0.01979639381170273, 
+ "step": 319 + }, + { + "clip_ratio/high_max": 2.36019068324822e-05, + "clip_ratio/high_mean": 5.90047670812055e-06, + "clip_ratio/low_mean": 2.704614530557592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2946622809504333e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15514.0, + "completions/max_terminated_length": 15514.0, + "completions/mean_length": 6428.8203125, + "completions/mean_terminated_length": 6428.8203125, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "entropy": 0.9974069148302078, + "epoch": 0.29438822447102114, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0028210312593728304, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 261465625.0, + "reward": 0.46875, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000195503234863, + "sampling/importance_sampling_ratio/min": 0.001225265790708363, + "sampling/sampling_logp_difference/max": 6.704597473144531, + "sampling/sampling_logp_difference/mean": 0.021066997200250626, + "step": 320 + }, + { + "clip_ratio/high_max": 2.9634452857862925e-05, + "clip_ratio/high_mean": 7.408613214465731e-06, + "clip_ratio/low_mean": 3.7066520235384814e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.447513333616371e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15224.0, + "completions/mean_length": 5664.515625, + "completions/mean_terminated_length": 5580.1103515625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "entropy": 0.9557281509041786, + "epoch": 0.2953081876724931, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024263609666377306, + "learning_rate": 1e-05, + "loss": 0.0357, + "num_tokens": 262208475.0, + "reward": 
0.4765625, + "reward_std": 0.26409637928009033, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998950958251953, + "sampling/importance_sampling_ratio/min": 0.0001059407222783193, + "sampling/sampling_logp_difference/max": 9.152630805969238, + "sampling/sampling_logp_difference/mean": 0.01997508481144905, + "step": 321 + }, + { + "clip_ratio/high_max": 1.9527269159880234e-05, + "clip_ratio/high_mean": 5.685056066795369e-06, + "clip_ratio/low_mean": 4.980480150607036e-05, + "clip_ratio/low_min": 5.136423624207964e-06, + "clip_ratio/region_mean": 5.5489856435997353e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15566.0, + "completions/mean_length": 6649.5390625, + "completions/mean_terminated_length": 6170.794921875, + "completions/min_length": 599.0, + "completions/min_terminated_length": 599.0, + "entropy": 0.9003193452954292, + "epoch": 0.29622815087396503, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025556792970746756, + "learning_rate": 1e-05, + "loss": 0.0366, + "num_tokens": 263078672.0, + "reward": 0.453125, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998431205749512, + "sampling/importance_sampling_ratio/min": 3.631301660789177e-05, + "sampling/sampling_logp_difference/max": 10.223334312438965, + "sampling/sampling_logp_difference/mean": 0.019613387063145638, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.1492368912513484e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.1492368912513484e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15190.0, + "completions/mean_length": 5819.4140625, + "completions/mean_terminated_length": 5478.62060546875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "entropy": 0.9234923645853996, + "epoch": 0.297148114075437, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0008845282136462629, + "learning_rate": 1e-05, + "loss": 0.077, + "num_tokens": 263843797.0, + "reward": 0.5390625, + "reward_std": 0.14913026988506317, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999452233314514, + "sampling/importance_sampling_ratio/min": 0.06759586930274963, + "sampling/sampling_logp_difference/max": 2.6942083835601807, + "sampling/sampling_logp_difference/mean": 0.02007308602333069, + "step": 323 + }, + { + "clip_ratio/high_max": 1.1687909363899962e-05, + "clip_ratio/high_mean": 2.9219773409749905e-06, + "clip_ratio/low_mean": 2.420720869622528e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7129186207730527e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16033.0, + "completions/mean_length": 6952.96875, + "completions/mean_terminated_length": 6726.62451171875, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 0.8909401148557663, + "epoch": 0.2980680772769089, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001527746208012104, + "learning_rate": 1e-05, + "loss": 0.0633, + "num_tokens": 264751769.0, + "reward": 0.453125, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999520778656006, + "sampling/importance_sampling_ratio/min": 0.000505264790263027, 
+ "sampling/sampling_logp_difference/max": 7.590427875518799, + "sampling/sampling_logp_difference/mean": 0.019622590392827988, + "step": 324 + }, + { + "clip_ratio/high_max": 1.5079081094881985e-05, + "clip_ratio/high_mean": 4.600909505825257e-06, + "clip_ratio/low_mean": 5.333864191925386e-05, + "clip_ratio/low_min": 5.043169494456379e-06, + "clip_ratio/region_mean": 5.793955187982647e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15667.0, + "completions/mean_length": 8138.5234375, + "completions/mean_terminated_length": 7733.0078125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 0.972789965569973, + "epoch": 0.29898804047838085, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003113618353381753, + "learning_rate": 1e-05, + "loss": 0.0771, + "num_tokens": 265810580.0, + "reward": 0.40625, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998828172683716, + "sampling/importance_sampling_ratio/min": 9.312124404914357e-08, + "sampling/sampling_logp_difference/max": 16.189363479614258, + "sampling/sampling_logp_difference/mean": 0.02168515883386135, + "step": 325 + }, + { + "clip_ratio/high_max": 4.463807272259146e-06, + "clip_ratio/high_mean": 1.1159518180647865e-06, + "clip_ratio/low_mean": 3.45970811395091e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.571303295757389e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16248.0, + "completions/mean_length": 7565.6015625, + "completions/mean_terminated_length": 7131.90966796875, + "completions/min_length": 1017.0, + "completions/min_terminated_length": 1017.0, + "entropy": 0.835600845515728, + "epoch": 0.2999080036798528, + 
"frac_reward_zero_std": 0.5625, + "grad_norm": 0.0009589543915353715, + "learning_rate": 1e-05, + "loss": 0.0509, + "num_tokens": 266796097.0, + "reward": 0.5078125, + "reward_std": 0.16834920644760132, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999743700027466, + "sampling/importance_sampling_ratio/min": 0.0017039220547303557, + "sampling/sampling_logp_difference/max": 6.374822616577148, + "sampling/sampling_logp_difference/mean": 0.01885361596941948, + "step": 326 + }, + { + "clip_ratio/high_max": 2.260646033391822e-05, + "clip_ratio/high_mean": 5.651615083479555e-06, + "clip_ratio/low_mean": 5.806843591926736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.372005145749426e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16167.0, + "completions/mean_length": 7124.0546875, + "completions/mean_terminated_length": 6668.64697265625, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "entropy": 0.9041655585169792, + "epoch": 0.30082796688132474, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024741124361753464, + "learning_rate": 1e-05, + "loss": 0.0514, + "num_tokens": 267727528.0, + "reward": 0.4296875, + "reward_std": 0.23592591285705566, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999866247177124, + "sampling/importance_sampling_ratio/min": 4.63160322397016e-05, + "sampling/sampling_logp_difference/max": 9.980022430419922, + "sampling/sampling_logp_difference/mean": 0.01998118683695793, + "step": 327 + }, + { + "clip_ratio/high_max": 1.7461054540035548e-05, + "clip_ratio/high_mean": 5.456775966194982e-06, + "clip_ratio/low_mean": 
3.374219397755951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.919897017112817e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14834.0, + "completions/mean_length": 6757.65625, + "completions/mean_terminated_length": 6681.8583984375, + "completions/min_length": 1123.0, + "completions/min_terminated_length": 1123.0, + "entropy": 1.105302907526493, + "epoch": 0.3017479300827967, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002233455190435052, + "learning_rate": 1e-05, + "loss": 0.0147, + "num_tokens": 268610868.0, + "reward": 0.375, + "reward_std": 0.23857943713665009, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 3.3169128528243164e-06, + "sampling/sampling_logp_difference/max": 12.616476058959961, + "sampling/sampling_logp_difference/mean": 0.021600255742669106, + "step": 328 + }, + { + "clip_ratio/high_max": 1.7514204046165105e-05, + "clip_ratio/high_mean": 4.378551011541276e-06, + "clip_ratio/low_mean": 4.300070588669769e-05, + "clip_ratio/low_min": 3.6705330330732977e-06, + "clip_ratio/region_mean": 4.7379256784552126e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16147.0, + "completions/mean_length": 7522.5546875, + "completions/mean_terminated_length": 7381.8974609375, + "completions/min_length": 1390.0, + "completions/min_terminated_length": 1390.0, + "entropy": 1.0577925741672516, + "epoch": 0.30266789328426863, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017964976141229272, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 269594867.0, + "reward": 0.421875, + "reward_std": 0.28223684430122375, + "rewards/accuracy_reward/mean": 0.421875, + 
"rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999937891960144, + "sampling/importance_sampling_ratio/min": 0.002247168216854334, + "sampling/sampling_logp_difference/max": 6.098084449768066, + "sampling/sampling_logp_difference/mean": 0.021326296031475067, + "step": 329 + }, + { + "clip_ratio/high_max": 1.7011016097967513e-05, + "clip_ratio/high_mean": 4.252754024491878e-06, + "clip_ratio/low_mean": 2.5991578013417893e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0244332265283447e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14462.0, + "completions/mean_length": 6232.109375, + "completions/mean_terminated_length": 5904.62890625, + "completions/min_length": 1238.0, + "completions/min_terminated_length": 1238.0, + "entropy": 0.8473618850111961, + "epoch": 0.30358785648574055, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023369218688458204, + "learning_rate": 1e-05, + "loss": 0.0291, + "num_tokens": 270410785.0, + "reward": 0.6015625, + "reward_std": 0.23516449332237244, + "rewards/accuracy_reward/mean": 0.6015625, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000063180923462, + "sampling/importance_sampling_ratio/min": 0.00010575528722256422, + "sampling/sampling_logp_difference/max": 9.154382705688477, + "sampling/sampling_logp_difference/mean": 0.018453873693943024, + "step": 330 + }, + { + "clip_ratio/high_max": 1.2072427125531249e-05, + "clip_ratio/high_mean": 4.300789669287042e-06, + "clip_ratio/low_mean": 3.064826853460545e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.4949058090205654e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14454.0, + "completions/max_terminated_length": 14454.0, + "completions/mean_length": 
5847.0625, + "completions/mean_terminated_length": 5847.0625, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.8186105340719223, + "epoch": 0.3045078196872125, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0014558705734089017, + "learning_rate": 1e-05, + "loss": 0.0672, + "num_tokens": 271179113.0, + "reward": 0.5390625, + "reward_std": 0.22673210501670837, + "rewards/accuracy_reward/mean": 0.5390625, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000067114830017, + "sampling/importance_sampling_ratio/min": 1.994453305087518e-05, + "sampling/sampling_logp_difference/max": 10.822555541992188, + "sampling/sampling_logp_difference/mean": 0.017629161477088928, + "step": 331 + }, + { + "clip_ratio/high_max": 3.204624090358266e-05, + "clip_ratio/high_mean": 8.719567063053546e-06, + "clip_ratio/low_mean": 5.131868192620459e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 6.0038249102944974e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16065.0, + "completions/mean_length": 6670.6015625, + "completions/mean_terminated_length": 6516.4208984375, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "entropy": 0.9379853457212448, + "epoch": 0.30542778288868444, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002496426459401846, + "learning_rate": 1e-05, + "loss": 0.051, + "num_tokens": 272054510.0, + "reward": 0.328125, + "reward_std": 0.29932624101638794, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998705387115479, + "sampling/importance_sampling_ratio/min": 0.00010894420120166615, + "sampling/sampling_logp_difference/max": 9.124674797058105, + 
"sampling/sampling_logp_difference/mean": 0.020175442099571228, + "step": 332 + }, + { + "clip_ratio/high_max": 1.1311959497106727e-05, + "clip_ratio/high_mean": 2.827989874276682e-06, + "clip_ratio/low_mean": 6.672416202491149e-05, + "clip_ratio/low_min": 4.344501576269977e-06, + "clip_ratio/region_mean": 6.955215212656185e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15716.0, + "completions/max_terminated_length": 15716.0, + "completions/mean_length": 6613.328125, + "completions/mean_terminated_length": 6613.328125, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "entropy": 1.0781218782067299, + "epoch": 0.3063477460901564, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028466631192713976, + "learning_rate": 1e-05, + "loss": 0.0257, + "num_tokens": 272920304.0, + "reward": 0.3359375, + "reward_std": 0.32089439034461975, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999476671218872, + "sampling/importance_sampling_ratio/min": 0.02985518053174019, + "sampling/sampling_logp_difference/max": 3.511396884918213, + "sampling/sampling_logp_difference/mean": 0.02250460349023342, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.3429964585375274e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3429964585375274e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15572.0, + "completions/mean_length": 6632.78125, + "completions/mean_terminated_length": 6318.2255859375, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "entropy": 0.9595735669136047, + "epoch": 0.30726770929162833, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0027409526519477367, + "learning_rate": 1e-05, + "loss": 0.0564, 
+ "num_tokens": 273789588.0, + "reward": 0.3671875, + "reward_std": 0.12863078713417053, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999409914016724, + "sampling/importance_sampling_ratio/min": 8.484355930704623e-05, + "sampling/sampling_logp_difference/max": 9.374701499938965, + "sampling/sampling_logp_difference/mean": 0.02000725269317627, + "step": 334 + }, + { + "clip_ratio/high_max": 1.0485138318472309e-05, + "clip_ratio/high_mean": 2.6212845796180773e-06, + "clip_ratio/low_mean": 6.270217818382662e-05, + "clip_ratio/low_min": 1.282997527596308e-05, + "clip_ratio/region_mean": 6.532346287713153e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15567.0, + "completions/mean_length": 8083.421875, + "completions/mean_terminated_length": 7884.20849609375, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "entropy": 1.139024168252945, + "epoch": 0.30818767249310025, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001853835303336382, + "learning_rate": 1e-05, + "loss": 0.0521, + "num_tokens": 274843754.0, + "reward": 0.2734375, + "reward_std": 0.29719969630241394, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999961256980896, + "sampling/importance_sampling_ratio/min": 6.099340225773631e-06, + "sampling/sampling_logp_difference/max": 12.007329940795898, + "sampling/sampling_logp_difference/mean": 0.023757295683026314, + "step": 335 + }, + { + "clip_ratio/high_max": 6.558237146236934e-06, + "clip_ratio/high_mean": 1.6395592865592334e-06, + "clip_ratio/low_mean": 3.2649955073793535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.428951481510012e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 16058.0, + "completions/max_terminated_length": 16058.0, + "completions/mean_length": 6932.6640625, + "completions/mean_terminated_length": 6932.6640625, + "completions/min_length": 666.0, + "completions/min_terminated_length": 666.0, + "entropy": 1.2969390451908112, + "epoch": 0.3091076356945722, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002049664966762066, + "learning_rate": 1e-05, + "loss": 0.0179, + "num_tokens": 275750023.0, + "reward": 0.21875, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000157356262207, + "sampling/importance_sampling_ratio/min": 5.287989188218489e-05, + "sampling/sampling_logp_difference/max": 9.847487449645996, + "sampling/sampling_logp_difference/mean": 0.021840902045369148, + "step": 336 + }, + { + "clip_ratio/high_max": 5.1826359594997484e-06, + "clip_ratio/high_mean": 1.2956589898749371e-06, + "clip_ratio/low_mean": 3.607215444390022e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.736781377483567e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15213.0, + "completions/mean_length": 7630.65625, + "completions/mean_terminated_length": 7124.26416015625, + "completions/min_length": 1002.0, + "completions/min_terminated_length": 1002.0, + "entropy": 0.959126852452755, + "epoch": 0.31002759889604414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030745298136025667, + "learning_rate": 1e-05, + "loss": 0.0487, + "num_tokens": 276750011.0, + "reward": 0.3125, + "reward_std": 0.30091896653175354, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999728798866272, 
+ "sampling/importance_sampling_ratio/min": 3.149233089061454e-05, + "sampling/sampling_logp_difference/max": 10.365766525268555, + "sampling/sampling_logp_difference/mean": 0.021394159644842148, + "step": 337 + }, + { + "clip_ratio/high_max": 6.921764679646003e-06, + "clip_ratio/high_mean": 2.5604765028219845e-06, + "clip_ratio/low_mean": 2.64957521380893e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.905622847038103e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15907.0, + "completions/mean_length": 7383.2421875, + "completions/mean_terminated_length": 7240.37353515625, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 1.1512386053800583, + "epoch": 0.3109475620975161, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0014476332580670714, + "learning_rate": 1e-05, + "loss": 0.0686, + "num_tokens": 277715450.0, + "reward": 0.4140625, + "reward_std": 0.2477683424949646, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750256538391, + "sampling/importance_sampling_ratio/min": 4.5251621486386284e-05, + "sampling/sampling_logp_difference/max": 10.00327205657959, + "sampling/sampling_logp_difference/mean": 0.020672230049967766, + "step": 338 + }, + { + "clip_ratio/high_max": 3.7021679872850655e-06, + "clip_ratio/high_mean": 9.255419968212664e-07, + "clip_ratio/low_mean": 3.8645826748506806e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.957136880217149e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14693.0, + "completions/mean_length": 5876.421875, + "completions/mean_terminated_length": 5793.68505859375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 
1.0786077454686165, + "epoch": 0.31186752529898804, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018895689863711596, + "learning_rate": 1e-05, + "loss": 0.0067, + "num_tokens": 278491688.0, + "reward": 0.3984375, + "reward_std": 0.21146979928016663, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998824596405029, + "sampling/importance_sampling_ratio/min": 0.0007111100130714476, + "sampling/sampling_logp_difference/max": 7.248683452606201, + "sampling/sampling_logp_difference/mean": 0.020282316952943802, + "step": 339 + }, + { + "clip_ratio/high_max": 1.8740533050731756e-05, + "clip_ratio/high_mean": 4.685133262682939e-06, + "clip_ratio/low_mean": 2.9699310402975243e-05, + "clip_ratio/low_min": 4.435140454006614e-06, + "clip_ratio/region_mean": 3.4384443438284507e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14627.0, + "completions/mean_length": 7162.625, + "completions/mean_terminated_length": 6709.1142578125, + "completions/min_length": 986.0, + "completions/min_terminated_length": 986.0, + "entropy": 0.898807168006897, + "epoch": 0.31278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002133915899321437, + "learning_rate": 1e-05, + "loss": 0.0222, + "num_tokens": 279427384.0, + "reward": 0.4453125, + "reward_std": 0.32142335176467896, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.004845126066356897, + "sampling/sampling_logp_difference/max": 5.329782009124756, + "sampling/sampling_logp_difference/mean": 0.019643021747469902, + "step": 340 + }, + { + "clip_ratio/high_max": 1.472241683586617e-05, + 
"clip_ratio/high_mean": 5.561973125622899e-06, + "clip_ratio/low_mean": 6.452910844245707e-05, + "clip_ratio/low_min": 9.302988473791629e-06, + "clip_ratio/region_mean": 7.009108327338254e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15965.0, + "completions/mean_length": 7072.3828125, + "completions/mean_terminated_length": 6999.06298828125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.8942967653274536, + "epoch": 0.3137074517019319, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023624920286238194, + "learning_rate": 1e-05, + "loss": 0.0866, + "num_tokens": 280352177.0, + "reward": 0.375, + "reward_std": 0.36637401580810547, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999604225158691, + "sampling/importance_sampling_ratio/min": 0.0008250995306298137, + "sampling/sampling_logp_difference/max": 7.100006580352783, + "sampling/sampling_logp_difference/mean": 0.020037520676851273, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.717265596809739e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.717265596809739e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16300.0, + "completions/max_terminated_length": 16300.0, + "completions/mean_length": 6553.203125, + "completions/mean_terminated_length": 6553.203125, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "entropy": 0.8765531405806541, + "epoch": 0.31462741490340385, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0025228122249245644, + "learning_rate": 1e-05, + "loss": 0.0539, + "num_tokens": 281208411.0, + "reward": 0.40625, + "reward_std": 0.3390446603298187, + "rewards/accuracy_reward/mean": 0.40625, + 
"rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999656677246094, + "sampling/importance_sampling_ratio/min": 0.00030091358348727226, + "sampling/sampling_logp_difference/max": 8.108687400817871, + "sampling/sampling_logp_difference/mean": 0.018958289176225662, + "step": 342 + }, + { + "clip_ratio/high_max": 1.5562100998067763e-05, + "clip_ratio/high_mean": 3.890525249516941e-06, + "clip_ratio/low_mean": 6.593948137378902e-05, + "clip_ratio/low_min": 1.4238520634535234e-05, + "clip_ratio/region_mean": 6.983000685067964e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14916.0, + "completions/mean_length": 6489.40625, + "completions/mean_terminated_length": 6087.1865234375, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.8384068235754967, + "epoch": 0.3155473781048758, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.003243578365072608, + "learning_rate": 1e-05, + "loss": 0.119, + "num_tokens": 282059863.0, + "reward": 0.515625, + "reward_std": 0.39689862728118896, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999314546585083, + "sampling/importance_sampling_ratio/min": 0.00026549631729722023, + "sampling/sampling_logp_difference/max": 8.233909606933594, + "sampling/sampling_logp_difference/mean": 0.01820875145494938, + "step": 343 + }, + { + "clip_ratio/high_max": 4.114007424504962e-06, + "clip_ratio/high_mean": 1.0285018561262405e-06, + "clip_ratio/low_mean": 3.0735714062757324e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.176421569150989e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15859.0, + "completions/max_terminated_length": 15859.0, + 
"completions/mean_length": 7148.7890625, + "completions/mean_terminated_length": 7148.7890625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 1.0214989855885506, + "epoch": 0.31646734130634774, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0027867467142641544, + "learning_rate": 1e-05, + "loss": 0.0445, + "num_tokens": 282994036.0, + "reward": 0.4921875, + "reward_std": 0.28511500358581543, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999145269393921, + "sampling/importance_sampling_ratio/min": 0.027774186804890633, + "sampling/sampling_logp_difference/max": 3.583648204803467, + "sampling/sampling_logp_difference/mean": 0.0217401385307312, + "step": 344 + }, + { + "clip_ratio/high_max": 1.6063933799159713e-05, + "clip_ratio/high_mean": 5.513276278179546e-06, + "clip_ratio/low_mean": 4.230772367463942e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.782100086231367e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16091.0, + "completions/max_terminated_length": 16091.0, + "completions/mean_length": 5532.1328125, + "completions/mean_terminated_length": 5532.1328125, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "entropy": 0.9303388148546219, + "epoch": 0.3173873045078197, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0024432060308754444, + "learning_rate": 1e-05, + "loss": 0.0251, + "num_tokens": 283723605.0, + "reward": 0.421875, + "reward_std": 0.38717782497406006, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819993972778, + "sampling/importance_sampling_ratio/min": 0.011936242692172527, + "sampling/sampling_logp_difference/max": 
4.428175926208496, + "sampling/sampling_logp_difference/mean": 0.019281461834907532, + "step": 345 + }, + { + "clip_ratio/high_max": 6.218693215487292e-06, + "clip_ratio/high_mean": 1.554673303871823e-06, + "clip_ratio/low_mean": 1.5384349637770356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6939022600581666e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15933.0, + "completions/mean_length": 6830.09375, + "completions/mean_terminated_length": 6441.72314453125, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "entropy": 0.9551377296447754, + "epoch": 0.31830726770929163, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0031446516513824463, + "learning_rate": 1e-05, + "loss": -0.0037, + "num_tokens": 284617089.0, + "reward": 0.3671875, + "reward_std": 0.20911568403244019, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999873042106628, + "sampling/importance_sampling_ratio/min": 0.0007485119276680052, + "sampling/sampling_logp_difference/max": 7.197423458099365, + "sampling/sampling_logp_difference/mean": 0.01985902711749077, + "step": 346 + }, + { + "clip_ratio/high_max": 7.772906428726856e-06, + "clip_ratio/high_mean": 2.8712697712762747e-06, + "clip_ratio/low_mean": 3.287052913947264e-05, + "clip_ratio/low_min": 2.789369091260596e-06, + "clip_ratio/region_mean": 3.574179936549626e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15613.0, + "completions/mean_length": 6557.3515625, + "completions/mean_terminated_length": 6401.37353515625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 1.0254710763692856, + "epoch": 0.31922723091076355, + "frac_reward_zero_std": 0.4375, + 
"grad_norm": 0.0024617225863039494, + "learning_rate": 1e-05, + "loss": 0.0669, + "num_tokens": 285475910.0, + "reward": 0.390625, + "reward_std": 0.2761683464050293, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999537467956543, + "sampling/importance_sampling_ratio/min": 0.006225659977644682, + "sampling/sampling_logp_difference/max": 5.079075813293457, + "sampling/sampling_logp_difference/mean": 0.021138068288564682, + "step": 347 + }, + { + "clip_ratio/high_max": 1.0258745533064939e-05, + "clip_ratio/high_mean": 3.588538106669148e-06, + "clip_ratio/low_mean": 6.333507008093875e-05, + "clip_ratio/low_min": 4.415712737682043e-06, + "clip_ratio/region_mean": 6.692360875604209e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15851.0, + "completions/mean_length": 7379.140625, + "completions/mean_terminated_length": 7088.6611328125, + "completions/min_length": 1243.0, + "completions/min_terminated_length": 1243.0, + "entropy": 0.9518962875008583, + "epoch": 0.3201471941122355, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017496495274826884, + "learning_rate": 1e-05, + "loss": 0.0734, + "num_tokens": 286439696.0, + "reward": 0.390625, + "reward_std": 0.26538965106010437, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999358654022217, + "sampling/importance_sampling_ratio/min": 0.006735759321600199, + "sampling/sampling_logp_difference/max": 5.000324726104736, + "sampling/sampling_logp_difference/mean": 0.021384600549936295, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.854056094747648e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 2.854056094747648e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16123.0, + "completions/mean_length": 5960.140625, + "completions/mean_terminated_length": 5878.06298828125, + "completions/min_length": 833.0, + "completions/min_terminated_length": 833.0, + "entropy": 0.9556702002882957, + "epoch": 0.32106715731370744, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0013999518705531955, + "learning_rate": 1e-05, + "loss": 0.0484, + "num_tokens": 287226394.0, + "reward": 0.3515625, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 8.140038517012727e-06, + "sampling/sampling_logp_difference/max": 11.71871566772461, + "sampling/sampling_logp_difference/mean": 0.01937047764658928, + "step": 349 + }, + { + "clip_ratio/high_max": 8.395007171202451e-06, + "clip_ratio/high_mean": 2.0987517928006127e-06, + "clip_ratio/low_mean": 3.610323426528339e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.820198628545768e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12561.0, + "completions/mean_length": 5387.546875, + "completions/mean_terminated_length": 5300.96044921875, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.95712860673666, + "epoch": 0.3219871205151794, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.004228786565363407, + "learning_rate": 1e-05, + "loss": 0.0692, + "num_tokens": 287935952.0, + "reward": 0.5234375, + "reward_std": 0.29378965497016907, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, 
+ "sampling/importance_sampling_ratio/mean": 1.0000360012054443, + "sampling/importance_sampling_ratio/min": 0.005966294556856155, + "sampling/sampling_logp_difference/max": 5.121629238128662, + "sampling/sampling_logp_difference/mean": 0.020441649481654167, + "step": 350 + }, + { + "clip_ratio/high_max": 1.2559637070808094e-05, + "clip_ratio/high_mean": 3.1399092677020235e-06, + "clip_ratio/low_mean": 2.673440690159623e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9874316624045605e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15957.0, + "completions/mean_length": 5799.625, + "completions/mean_terminated_length": 5716.283203125, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "entropy": 0.9457403644919395, + "epoch": 0.32290708371665133, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0029834613669663668, + "learning_rate": 1e-05, + "loss": 0.0589, + "num_tokens": 288696000.0, + "reward": 0.4921875, + "reward_std": 0.3884710967540741, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999950528144836, + "sampling/importance_sampling_ratio/min": 0.0011352180736139417, + "sampling/sampling_logp_difference/max": 6.780930519104004, + "sampling/sampling_logp_difference/mean": 0.021189026534557343, + "step": 351 + }, + { + "clip_ratio/high_max": 6.2518756749341264e-06, + "clip_ratio/high_mean": 1.5629689187335316e-06, + "clip_ratio/low_mean": 3.849920358334202e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0062172047328204e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16148.0, + "completions/mean_length": 7058.6875, + "completions/mean_terminated_length": 6757.87060546875, + "completions/min_length": 799.0, + 
"completions/min_terminated_length": 799.0, + "entropy": 0.8782663866877556, + "epoch": 0.32382704691812325, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002617151942104101, + "learning_rate": 1e-05, + "loss": 0.0874, + "num_tokens": 289618904.0, + "reward": 0.3515625, + "reward_std": 0.28353992104530334, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999942779541016, + "sampling/importance_sampling_ratio/min": 0.001438659499399364, + "sampling/sampling_logp_difference/max": 6.54404354095459, + "sampling/sampling_logp_difference/mean": 0.019699860364198685, + "step": 352 + }, + { + "clip_ratio/high_max": 1.8079134861181956e-05, + "clip_ratio/high_mean": 4.519783715295489e-06, + "clip_ratio/low_mean": 6.639697721766424e-05, + "clip_ratio/low_min": 1.0295151696482208e-05, + "clip_ratio/region_mean": 7.091676206982811e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15519.0, + "completions/mean_length": 6609.953125, + "completions/mean_terminated_length": 6454.81005859375, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.8895087689161301, + "epoch": 0.3247470101195952, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0021503251045942307, + "learning_rate": 1e-05, + "loss": 0.044, + "num_tokens": 290484378.0, + "reward": 0.3671875, + "reward_std": 0.35324612259864807, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 5.448641240946017e-05, + "sampling/sampling_logp_difference/max": 9.817559242248535, + "sampling/sampling_logp_difference/mean": 0.0200796015560627, + "step": 353 + }, + { + 
"clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 6.141278026916552e-05, + "clip_ratio/low_min": 1.333249815616e-05, + "clip_ratio/region_mean": 6.141278026916552e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16284.0, + "completions/mean_length": 7872.4921875, + "completions/mean_terminated_length": 7453.89306640625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 0.9183534607291222, + "epoch": 0.32566697332106714, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0023925534915179014, + "learning_rate": 1e-05, + "loss": 0.0895, + "num_tokens": 291512393.0, + "reward": 0.34375, + "reward_std": 0.3763991594314575, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999232292175293, + "sampling/importance_sampling_ratio/min": 0.0004287353658583015, + "sampling/sampling_logp_difference/max": 7.7546706199646, + "sampling/sampling_logp_difference/mean": 0.020358648151159286, + "step": 354 + }, + { + "clip_ratio/high_max": 1.0912609013757901e-05, + "clip_ratio/high_mean": 3.7178592720010784e-06, + "clip_ratio/low_mean": 1.995230707052542e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.367016588777915e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15485.0, + "completions/mean_length": 6605.6640625, + "completions/mean_terminated_length": 6290.23388671875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.9602678120136261, + "epoch": 0.3265869365225391, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018709113355726004, + "learning_rate": 1e-05, + "loss": 0.0642, + "num_tokens": 292380390.0, + "reward": 0.515625, + "reward_std": 
0.26303553581237793, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999732375144958, + "sampling/importance_sampling_ratio/min": 6.221406168016586e-10, + "sampling/sampling_logp_difference/max": 21.19785499572754, + "sampling/sampling_logp_difference/mean": 0.02150166593492031, + "step": 355 + }, + { + "clip_ratio/high_max": 2.202200403189636e-05, + "clip_ratio/high_mean": 6.279054105107207e-06, + "clip_ratio/low_mean": 5.168271604816255e-05, + "clip_ratio/low_min": 7.731559890089557e-06, + "clip_ratio/region_mean": 5.796177038064343e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 13477.0, + "completions/max_terminated_length": 13477.0, + "completions/mean_length": 6677.8828125, + "completions/mean_terminated_length": 6677.8828125, + "completions/min_length": 754.0, + "completions/min_terminated_length": 754.0, + "entropy": 1.001693107187748, + "epoch": 0.32750689972401104, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0017649955116212368, + "learning_rate": 1e-05, + "loss": 0.0502, + "num_tokens": 293255287.0, + "reward": 0.3203125, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998878240585327, + "sampling/importance_sampling_ratio/min": 0.0027159738820046186, + "sampling/sampling_logp_difference/max": 5.908604621887207, + "sampling/sampling_logp_difference/mean": 0.020375655964016914, + "step": 356 + }, + { + "clip_ratio/high_max": 5.7686097534315195e-06, + "clip_ratio/high_mean": 2.223324372607749e-06, + "clip_ratio/low_mean": 2.7612236522145395e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9835560894753144e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15925.0, + "completions/mean_length": 6210.6953125, + "completions/mean_terminated_length": 6049.21484375, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.9842480793595314, + "epoch": 0.32842686292548295, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024816791992634535, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 294069184.0, + "reward": 0.4140625, + "reward_std": 0.2845958471298218, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000953674316406, + "sampling/importance_sampling_ratio/min": 0.0047831060364842415, + "sampling/sampling_logp_difference/max": 5.342665195465088, + "sampling/sampling_logp_difference/mean": 0.021009165793657303, + "step": 357 + }, + { + "clip_ratio/high_max": 5.0844009820139036e-06, + "clip_ratio/high_mean": 1.2711002455034759e-06, + "clip_ratio/low_mean": 4.299241186345171e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.426351074471313e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16085.0, + "completions/mean_length": 6876.0546875, + "completions/mean_terminated_length": 6725.13525390625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.8680268228054047, + "epoch": 0.32934682612695493, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0030787813011556864, + "learning_rate": 1e-05, + "loss": 0.1096, + "num_tokens": 294969111.0, + "reward": 0.4921875, + "reward_std": 0.3514111638069153, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999387264251709, + "sampling/importance_sampling_ratio/min": 
0.0036591701209545135, + "sampling/sampling_logp_difference/max": 5.610518932342529, + "sampling/sampling_logp_difference/mean": 0.019419874995946884, + "step": 358 + }, + { + "clip_ratio/high_max": 5.279830929794116e-06, + "clip_ratio/high_mean": 1.319957732448529e-06, + "clip_ratio/low_mean": 3.3445195754211454e-05, + "clip_ratio/low_min": 3.1955414669937454e-06, + "clip_ratio/region_mean": 3.476515314559947e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16154.0, + "completions/mean_length": 7079.7734375, + "completions/mean_terminated_length": 6932.087890625, + "completions/min_length": 973.0, + "completions/min_terminated_length": 973.0, + "entropy": 1.0033101588487625, + "epoch": 0.33026678932842685, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027940638829022646, + "learning_rate": 1e-05, + "loss": 0.1352, + "num_tokens": 295894682.0, + "reward": 0.4140625, + "reward_std": 0.40319663286209106, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999890923500061, + "sampling/importance_sampling_ratio/min": 0.00033553718822076917, + "sampling/sampling_logp_difference/max": 7.999777793884277, + "sampling/sampling_logp_difference/mean": 0.021608728915452957, + "step": 359 + }, + { + "clip_ratio/high_max": 4.0542295209888835e-06, + "clip_ratio/high_mean": 1.0135573802472209e-06, + "clip_ratio/low_mean": 3.935158406420669e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.0365141785514425e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14374.0, + "completions/mean_length": 6487.421875, + "completions/mean_terminated_length": 6249.904296875, + "completions/min_length": 637.0, + "completions/min_terminated_length": 637.0, + "entropy": 0.9404204189777374, + 
"epoch": 0.3311867525298988, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021709369029849768, + "learning_rate": 1e-05, + "loss": 0.0479, + "num_tokens": 296744216.0, + "reward": 0.4296875, + "reward_std": 0.31800350546836853, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000174045562744, + "sampling/importance_sampling_ratio/min": 0.00012341697583906353, + "sampling/sampling_logp_difference/max": 8.9999418258667, + "sampling/sampling_logp_difference/mean": 0.02024281956255436, + "step": 360 + }, + { + "clip_ratio/high_max": 2.4414162908215076e-05, + "clip_ratio/high_mean": 6.103540727053769e-06, + "clip_ratio/low_mean": 2.0490186102506414e-05, + "clip_ratio/low_min": 2.8498473056970397e-06, + "clip_ratio/region_mean": 2.6593726602186507e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14584.0, + "completions/mean_length": 6668.1953125, + "completions/mean_terminated_length": 6273.24365234375, + "completions/min_length": 567.0, + "completions/min_terminated_length": 567.0, + "entropy": 0.8671490699052811, + "epoch": 0.33210671573137074, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018110686214640737, + "learning_rate": 1e-05, + "loss": -0.0018, + "num_tokens": 297617937.0, + "reward": 0.4765625, + "reward_std": 0.22673210501670837, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999983549118042, + "sampling/importance_sampling_ratio/min": 0.0003801324055530131, + "sampling/sampling_logp_difference/max": 7.874990940093994, + "sampling/sampling_logp_difference/mean": 0.01934785582125187, + "step": 361 + }, + { + "clip_ratio/high_max": 8.66071218297293e-06, + "clip_ratio/high_mean": 
2.1651780457432324e-06, + "clip_ratio/low_mean": 2.4539695857583865e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6704873903327098e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15979.0, + "completions/mean_length": 8579.9921875, + "completions/mean_terminated_length": 7989.7734375, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 1.0337364450097084, + "epoch": 0.3330266789328427, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014365602983161807, + "learning_rate": 1e-05, + "loss": 0.045, + "num_tokens": 298736304.0, + "reward": 0.1953125, + "reward_std": 0.1999218761920929, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999146461486816, + "sampling/importance_sampling_ratio/min": 0.0014037116197869182, + "sampling/sampling_logp_difference/max": 6.5686354637146, + "sampling/sampling_logp_difference/mean": 0.021067796275019646, + "step": 362 + }, + { + "clip_ratio/high_max": 7.748803682261496e-06, + "clip_ratio/high_mean": 1.937200920565374e-06, + "clip_ratio/low_mean": 5.063434127805522e-05, + "clip_ratio/low_min": 9.66116931522265e-06, + "clip_ratio/region_mean": 5.257154271021136e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16368.0, + "completions/mean_length": 7000.8203125, + "completions/mean_terminated_length": 6926.93701171875, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.8918163478374481, + "epoch": 0.33394664213431463, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003008107887580991, + "learning_rate": 1e-05, + "loss": 0.0862, + "num_tokens": 299653249.0, + "reward": 0.453125, + "reward_std": 0.3322049677371979, + 
"rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999739527702332, + "sampling/importance_sampling_ratio/min": 0.002478980226442218, + "sampling/sampling_logp_difference/max": 5.999907970428467, + "sampling/sampling_logp_difference/mean": 0.020022090524435043, + "step": 363 + }, + { + "clip_ratio/high_max": 1.5043352505017538e-05, + "clip_ratio/high_mean": 3.7608381262543844e-06, + "clip_ratio/low_mean": 8.800596447144926e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.2561434687086148e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16375.0, + "completions/max_terminated_length": 16375.0, + "completions/mean_length": 7319.578125, + "completions/mean_terminated_length": 7319.578125, + "completions/min_length": 1974.0, + "completions/min_terminated_length": 1974.0, + "entropy": 0.9145128801465034, + "epoch": 0.33486660533578655, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.0010370119707658887, + "learning_rate": 1e-05, + "loss": 0.0138, + "num_tokens": 300608099.0, + "reward": 0.4609375, + "reward_std": 0.1412346363067627, + "rewards/accuracy_reward/mean": 0.4609375, + "rewards/accuracy_reward/std": 0.5004304051399231, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999158382415771, + "sampling/importance_sampling_ratio/min": 0.00012156071898061782, + "sampling/sampling_logp_difference/max": 9.015096664428711, + "sampling/sampling_logp_difference/mean": 0.019386455416679382, + "step": 364 + }, + { + "clip_ratio/high_max": 9.589830597178661e-06, + "clip_ratio/high_mean": 2.3974576492946653e-06, + "clip_ratio/low_mean": 2.2494899667435675e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4892357714634272e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 
16316.0, + "completions/mean_length": 6956.90625, + "completions/mean_terminated_length": 6882.67724609375, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.9679212644696236, + "epoch": 0.3357865685372585, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021569218952208757, + "learning_rate": 1e-05, + "loss": 0.0621, + "num_tokens": 301516535.0, + "reward": 0.4765625, + "reward_std": 0.23462772369384766, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.01621459797024727, + "sampling/sampling_logp_difference/max": 4.121843338012695, + "sampling/sampling_logp_difference/mean": 0.020638462156057358, + "step": 365 + }, + { + "clip_ratio/high_max": 1.1957331025769236e-05, + "clip_ratio/high_mean": 2.989332756442309e-06, + "clip_ratio/low_mean": 2.334770033485256e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6337033204981708e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16343.0, + "completions/mean_length": 6933.1953125, + "completions/mean_terminated_length": 6706.37646484375, + "completions/min_length": 979.0, + "completions/min_terminated_length": 979.0, + "entropy": 0.9610472694039345, + "epoch": 0.33670653173873044, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0019900640472769737, + "learning_rate": 1e-05, + "loss": 0.0329, + "num_tokens": 302422120.0, + "reward": 0.4921875, + "reward_std": 0.22908620536327362, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999517202377319, + "sampling/importance_sampling_ratio/min": 7.346414143682978e-09, + 
"sampling/sampling_logp_difference/max": 18.729053497314453, + "sampling/sampling_logp_difference/mean": 0.020782412961125374, + "step": 366 + }, + { + "clip_ratio/high_max": 1.6365190958822495e-05, + "clip_ratio/high_mean": 4.091297739705624e-06, + "clip_ratio/low_mean": 2.5385876426753384e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9477173825398495e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15799.0, + "completions/max_terminated_length": 15799.0, + "completions/mean_length": 6711.640625, + "completions/mean_terminated_length": 6711.640625, + "completions/min_length": 814.0, + "completions/min_terminated_length": 814.0, + "entropy": 0.8035724982619286, + "epoch": 0.3376264949402024, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001954294042661786, + "learning_rate": 1e-05, + "loss": 0.0264, + "num_tokens": 303299402.0, + "reward": 0.4765625, + "reward_std": 0.2856517732143402, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000114440917969, + "sampling/importance_sampling_ratio/min": 0.002623806707561016, + "sampling/sampling_logp_difference/max": 5.943129062652588, + "sampling/sampling_logp_difference/mean": 0.018188728019595146, + "step": 367 + }, + { + "clip_ratio/high_max": 8.633360948806512e-06, + "clip_ratio/high_mean": 2.158340237201628e-06, + "clip_ratio/low_mean": 3.7187305906627444e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9345645916455396e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15980.0, + "completions/mean_length": 6977.890625, + "completions/mean_terminated_length": 6674.4677734375, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "entropy": 0.9545647650957108, + "epoch": 0.33854645814167433, + "frac_reward_zero_std": 
0.5625, + "grad_norm": 0.0022571857552975416, + "learning_rate": 1e-05, + "loss": 0.0187, + "num_tokens": 304210412.0, + "reward": 0.4375, + "reward_std": 0.19568344950675964, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999645948410034, + "sampling/importance_sampling_ratio/min": 5.501153282239102e-06, + "sampling/sampling_logp_difference/max": 12.110552787780762, + "sampling/sampling_logp_difference/mean": 0.021196123212575912, + "step": 368 + }, + { + "clip_ratio/high_max": 1.2197504474897869e-05, + "clip_ratio/high_mean": 3.0493761187244672e-06, + "clip_ratio/low_mean": 2.7975384682576987e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1024760801301454e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16087.0, + "completions/mean_length": 5952.8359375, + "completions/mean_terminated_length": 5349.3798828125, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "entropy": 0.846152663230896, + "epoch": 0.33946642134314625, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003541936632245779, + "learning_rate": 1e-05, + "loss": 0.0897, + "num_tokens": 304989015.0, + "reward": 0.4453125, + "reward_std": 0.3022122383117676, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998842477798462, + "sampling/importance_sampling_ratio/min": 0.0019083521328866482, + "sampling/sampling_logp_difference/max": 6.261515140533447, + "sampling/sampling_logp_difference/mean": 0.018978029489517212, + "step": 369 + }, + { + "clip_ratio/high_max": 1.1725882586688385e-05, + "clip_ratio/high_mean": 2.9314706466720963e-06, + "clip_ratio/low_mean": 6.290217379500973e-05, + 
"clip_ratio/low_min": 1.226112590302364e-05, + "clip_ratio/region_mean": 6.583364438483841e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16098.0, + "completions/mean_length": 7976.9296875, + "completions/mean_terminated_length": 7635.1787109375, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "entropy": 0.9827005565166473, + "epoch": 0.3403863845446182, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023713603150099516, + "learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 306032054.0, + "reward": 0.3046875, + "reward_std": 0.2527809143066406, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000019073486328, + "sampling/importance_sampling_ratio/min": 3.2563195873080986e-07, + "sampling/sampling_logp_difference/max": 14.937498092651367, + "sampling/sampling_logp_difference/mean": 0.0217706598341465, + "step": 370 + }, + { + "clip_ratio/high_max": 2.3902987095425487e-05, + "clip_ratio/high_mean": 7.721868257704045e-06, + "clip_ratio/low_mean": 4.01184702241153e-05, + "clip_ratio/low_min": 1.341508686891757e-05, + "clip_ratio/region_mean": 4.784033922078379e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16369.0, + "completions/mean_length": 7117.8828125, + "completions/mean_terminated_length": 6895.49609375, + "completions/min_length": 1314.0, + "completions/min_terminated_length": 1314.0, + "entropy": 0.8897347301244736, + "epoch": 0.34130634774609014, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0023132229689508677, + "learning_rate": 1e-05, + "loss": 0.162, + "num_tokens": 306960599.0, + "reward": 0.515625, + "reward_std": 0.34822866320610046, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 
0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999181032180786, + "sampling/importance_sampling_ratio/min": 0.0007341355667449534, + "sampling/sampling_logp_difference/max": 7.2168169021606445, + "sampling/sampling_logp_difference/mean": 0.018669119104743004, + "step": 371 + }, + { + "clip_ratio/high_max": 4.371240720502101e-06, + "clip_ratio/high_mean": 1.0928101801255252e-06, + "clip_ratio/low_mean": 4.9660218792269006e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.075302897239453e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15995.0, + "completions/mean_length": 6971.0390625, + "completions/mean_terminated_length": 6745.12841796875, + "completions/min_length": 871.0, + "completions/min_terminated_length": 871.0, + "entropy": 1.0919678956270218, + "epoch": 0.3422263109475621, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0030236958991736174, + "learning_rate": 1e-05, + "loss": 0.0754, + "num_tokens": 307873100.0, + "reward": 0.3359375, + "reward_std": 0.34245961904525757, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000091791152954, + "sampling/importance_sampling_ratio/min": 0.01082979142665863, + "sampling/sampling_logp_difference/max": 4.525454521179199, + "sampling/sampling_logp_difference/mean": 0.022024717181921005, + "step": 372 + }, + { + "clip_ratio/high_max": 4.341634394222638e-06, + "clip_ratio/high_mean": 1.0854085985556594e-06, + "clip_ratio/low_mean": 3.061858558339736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.170399429563986e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14442.0, + "completions/mean_length": 7120.0, + 
"completions/mean_terminated_length": 6897.66455078125, + "completions/min_length": 1685.0, + "completions/min_terminated_length": 1685.0, + "entropy": 1.0812252908945084, + "epoch": 0.34314627414903404, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0018919071881100535, + "learning_rate": 1e-05, + "loss": 0.0542, + "num_tokens": 308804876.0, + "reward": 0.28125, + "reward_std": 0.2522490322589874, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999696612358093, + "sampling/importance_sampling_ratio/min": 0.0011743507348001003, + "sampling/sampling_logp_difference/max": 6.747039794921875, + "sampling/sampling_logp_difference/mean": 0.022177904844284058, + "step": 373 + }, + { + "clip_ratio/high_max": 4.6198765630833805e-06, + "clip_ratio/high_mean": 1.1549691407708451e-06, + "clip_ratio/low_mean": 1.3996559573570266e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.5151528714341111e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15814.0, + "completions/mean_length": 7344.5546875, + "completions/mean_terminated_length": 6977.09716796875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.9340410158038139, + "epoch": 0.34406623735050595, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001848200336098671, + "learning_rate": 1e-05, + "loss": 0.0195, + "num_tokens": 309762603.0, + "reward": 0.4296875, + "reward_std": 0.2188364714384079, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999948143959045, + "sampling/importance_sampling_ratio/min": 0.0002964614541269839, + "sampling/sampling_logp_difference/max": 8.1235933303833, + 
"sampling/sampling_logp_difference/mean": 0.02034556306898594, + "step": 374 + }, + { + "clip_ratio/high_max": 1.3913735983805964e-05, + "clip_ratio/high_mean": 3.478433995951491e-06, + "clip_ratio/low_mean": 2.4544106395296694e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8022539936500834e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15244.0, + "completions/max_terminated_length": 15244.0, + "completions/mean_length": 6615.6484375, + "completions/mean_terminated_length": 6615.6484375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.971637412905693, + "epoch": 0.34498620055197793, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0012123315827921033, + "learning_rate": 1e-05, + "loss": 0.0581, + "num_tokens": 310628230.0, + "reward": 0.4296875, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999868869781494, + "sampling/importance_sampling_ratio/min": 2.587145718280226e-05, + "sampling/sampling_logp_difference/max": 10.562370300292969, + "sampling/sampling_logp_difference/mean": 0.020877305418252945, + "step": 375 + }, + { + "clip_ratio/high_max": 6.119951194705209e-06, + "clip_ratio/high_mean": 1.5299877986763022e-06, + "clip_ratio/low_mean": 4.789722436271404e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.942721272982453e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16028.0, + "completions/mean_length": 6333.84375, + "completions/mean_terminated_length": 6009.64501953125, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "entropy": 0.9569023698568344, + "epoch": 0.34590616375344985, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002646032487973571, + "learning_rate": 
1e-05, + "loss": 0.086, + "num_tokens": 311457466.0, + "reward": 0.4453125, + "reward_std": 0.34928950667381287, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000247955322266, + "sampling/importance_sampling_ratio/min": 0.022760435938835144, + "sampling/sampling_logp_difference/max": 3.782731533050537, + "sampling/sampling_logp_difference/mean": 0.020464638248085976, + "step": 376 + }, + { + "clip_ratio/high_max": 1.8126566374121467e-05, + "clip_ratio/high_mean": 4.531641593530367e-06, + "clip_ratio/low_mean": 4.1024483266483e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5556124632639694e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15719.0, + "completions/mean_length": 6657.8515625, + "completions/mean_terminated_length": 6503.46875, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "entropy": 1.029910758137703, + "epoch": 0.3468261269549218, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021437006071209908, + "learning_rate": 1e-05, + "loss": -0.0212, + "num_tokens": 312330879.0, + "reward": 0.4453125, + "reward_std": 0.25354230403900146, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000024437904358, + "sampling/importance_sampling_ratio/min": 0.020200612023472786, + "sampling/sampling_logp_difference/max": 3.9020423889160156, + "sampling/sampling_logp_difference/mean": 0.021411258727312088, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7961265118392475e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7961265118392475e-05, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 16311.0, + "completions/mean_length": 7657.8359375, + "completions/mean_terminated_length": 7376.3466796875, + "completions/min_length": 741.0, + "completions/min_terminated_length": 741.0, + "entropy": 0.9699486121535301, + "epoch": 0.34774609015639374, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0018965511117130518, + "learning_rate": 1e-05, + "loss": 0.066, + "num_tokens": 313331898.0, + "reward": 0.3515625, + "reward_std": 0.18884865939617157, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000010371208191, + "sampling/importance_sampling_ratio/min": 7.867415661166888e-06, + "sampling/sampling_logp_difference/max": 11.75278091430664, + "sampling/sampling_logp_difference/mean": 0.021029409021139145, + "step": 378 + }, + { + "clip_ratio/high_max": 7.721664815107943e-06, + "clip_ratio/high_mean": 2.7168170504410227e-06, + "clip_ratio/low_mean": 4.313065619498957e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.584747375702136e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14619.0, + "completions/mean_length": 7085.3671875, + "completions/mean_terminated_length": 6937.77001953125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 1.0943557620048523, + "epoch": 0.3486660533578657, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016498853219673038, + "learning_rate": 1e-05, + "loss": 0.0346, + "num_tokens": 314258601.0, + "reward": 0.3203125, + "reward_std": 0.24329257011413574, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000105857849121, + 
"sampling/importance_sampling_ratio/min": 0.03447282314300537, + "sampling/sampling_logp_difference/max": 3.367583990097046, + "sampling/sampling_logp_difference/mean": 0.021414825692772865, + "step": 379 + }, + { + "clip_ratio/high_max": 7.953489330247976e-06, + "clip_ratio/high_mean": 1.988372332561994e-06, + "clip_ratio/low_mean": 3.479703536868328e-05, + "clip_ratio/low_min": 2.6767741019284585e-06, + "clip_ratio/region_mean": 3.6785407701245276e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15631.0, + "completions/mean_length": 7614.1171875, + "completions/mean_terminated_length": 7182.81103515625, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "entropy": 0.9673903658986092, + "epoch": 0.34958601655933763, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001364902127534151, + "learning_rate": 1e-05, + "loss": 0.0041, + "num_tokens": 315256840.0, + "reward": 0.4296875, + "reward_std": 0.3503454327583313, + "rewards/accuracy_reward/mean": 0.4296875, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 6.874255632283166e-05, + "sampling/sampling_logp_difference/max": 9.585142135620117, + "sampling/sampling_logp_difference/mean": 0.02000460773706436, + "step": 380 + }, + { + "clip_ratio/high_max": 6.980824764468707e-06, + "clip_ratio/high_mean": 1.7452061911171768e-06, + "clip_ratio/low_mean": 4.410173994529032e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.5846945681660145e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15637.0, + "completions/mean_length": 7766.9375, + "completions/mean_terminated_length": 7630.1591796875, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + 
"entropy": 1.0277370810508728, + "epoch": 0.35050597976080955, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002171436557546258, + "learning_rate": 1e-05, + "loss": 0.0705, + "num_tokens": 316268976.0, + "reward": 0.34375, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999735951423645, + "sampling/importance_sampling_ratio/min": 7.485197420464829e-05, + "sampling/sampling_logp_difference/max": 9.499998092651367, + "sampling/sampling_logp_difference/mean": 0.021251089870929718, + "step": 381 + }, + { + "clip_ratio/high_max": 9.843256520980503e-06, + "clip_ratio/high_mean": 3.5061395919910865e-06, + "clip_ratio/low_mean": 3.973216325903195e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.323830307839671e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15752.0, + "completions/mean_length": 7774.2265625, + "completions/mean_terminated_length": 7567.59228515625, + "completions/min_length": 595.0, + "completions/min_terminated_length": 595.0, + "entropy": 1.0064171329140663, + "epoch": 0.3514259429622815, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0013348929351195693, + "learning_rate": 1e-05, + "loss": 0.0336, + "num_tokens": 317285677.0, + "reward": 0.28125, + "reward_std": 0.23934084177017212, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999904632568359, + "sampling/importance_sampling_ratio/min": 1.7632934259381727e-06, + "sampling/sampling_logp_difference/max": 13.248327255249023, + "sampling/sampling_logp_difference/mean": 0.022232960909605026, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + 
"clip_ratio/low_mean": 3.2021426648043416e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.2021426648043416e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16208.0, + "completions/mean_length": 6547.1796875, + "completions/mean_terminated_length": 6469.724609375, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 0.9192209765315056, + "epoch": 0.35234590616375344, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002925361506640911, + "learning_rate": 1e-05, + "loss": 0.0809, + "num_tokens": 318148276.0, + "reward": 0.515625, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999691843986511, + "sampling/importance_sampling_ratio/min": 7.411971182591515e-06, + "sampling/sampling_logp_difference/max": 11.812414169311523, + "sampling/sampling_logp_difference/mean": 0.020470617339015007, + "step": 383 + }, + { + "clip_ratio/high_max": 1.543848429719219e-05, + "clip_ratio/high_mean": 3.8596210742980475e-06, + "clip_ratio/low_mean": 2.0332364726982632e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4191985573907004e-05, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15923.0, + "completions/mean_length": 6904.40625, + "completions/mean_terminated_length": 6101.05078125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "entropy": 0.9611739367246628, + "epoch": 0.3532658693652254, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002288331277668476, + "learning_rate": 1e-05, + "loss": 0.0388, + "num_tokens": 319052224.0, + "reward": 0.390625, + "reward_std": 0.23645779490470886, + "rewards/accuracy_reward/mean": 0.390625, + 
"rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999583959579468, + "sampling/importance_sampling_ratio/min": 1.0906596799031831e-05, + "sampling/sampling_logp_difference/max": 11.426142692565918, + "sampling/sampling_logp_difference/mean": 0.02049478143453598, + "step": 384 + }, + { + "clip_ratio/high_max": 1.0430391284899088e-05, + "clip_ratio/high_mean": 3.662984454422258e-06, + "clip_ratio/low_mean": 3.791802066643868e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.158100534823461e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16195.0, + "completions/mean_length": 7632.359375, + "completions/mean_terminated_length": 7350.04833984375, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 1.0255606770515442, + "epoch": 0.35418583256669733, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015405584126710892, + "learning_rate": 1e-05, + "loss": 0.111, + "num_tokens": 320051534.0, + "reward": 0.3515625, + "reward_std": 0.30327799916267395, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000008344650269, + "sampling/importance_sampling_ratio/min": 0.00014919505338184536, + "sampling/sampling_logp_difference/max": 8.810256004333496, + "sampling/sampling_logp_difference/mean": 0.021682340651750565, + "step": 385 + }, + { + "clip_ratio/high_max": 1.10081018647179e-05, + "clip_ratio/high_mean": 2.752025466179475e-06, + "clip_ratio/low_mean": 2.2116193804322393e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4868219043128192e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14348.0, + "completions/mean_length": 
6260.8828125, + "completions/mean_terminated_length": 6100.19873046875, + "completions/min_length": 1371.0, + "completions/min_terminated_length": 1371.0, + "entropy": 0.7945073395967484, + "epoch": 0.35510579576816925, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00142462900839746, + "learning_rate": 1e-05, + "loss": 0.0707, + "num_tokens": 320872143.0, + "reward": 0.53125, + "reward_std": 0.18990948796272278, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999794960021973, + "sampling/importance_sampling_ratio/min": 0.0003461402375251055, + "sampling/sampling_logp_difference/max": 7.9686665534973145, + "sampling/sampling_logp_difference/mean": 0.018331468105316162, + "step": 386 + }, + { + "clip_ratio/high_max": 8.952299140219111e-06, + "clip_ratio/high_mean": 2.2380747850547777e-06, + "clip_ratio/low_mean": 2.7251681331108557e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9489756570910686e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15807.0, + "completions/mean_length": 7761.375, + "completions/mean_terminated_length": 7693.48046875, + "completions/min_length": 765.0, + "completions/min_terminated_length": 765.0, + "entropy": 1.0799954682588577, + "epoch": 0.3560257589696412, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0019111793953925371, + "learning_rate": 1e-05, + "loss": 0.0527, + "num_tokens": 321885447.0, + "reward": 0.390625, + "reward_std": 0.2806519567966461, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999994039535522, + "sampling/importance_sampling_ratio/min": 0.00028313760412856936, + "sampling/sampling_logp_difference/max": 8.169577598571777, + 
"sampling/sampling_logp_difference/mean": 0.02205459028482437, + "step": 387 + }, + { + "clip_ratio/high_max": 1.6241773209912935e-05, + "clip_ratio/high_mean": 5.09954668359569e-06, + "clip_ratio/low_mean": 4.549925756691664e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.0598803454704466e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15808.0, + "completions/mean_length": 7294.796875, + "completions/mean_terminated_length": 7076.65625, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.8159547671675682, + "epoch": 0.35694572217111314, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.001482969499193132, + "learning_rate": 1e-05, + "loss": 0.0502, + "num_tokens": 322838797.0, + "reward": 0.5234375, + "reward_std": 0.36007601022720337, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999374151229858, + "sampling/importance_sampling_ratio/min": 4.2558355062283226e-07, + "sampling/sampling_logp_difference/max": 14.669804573059082, + "sampling/sampling_logp_difference/mean": 0.01850103959441185, + "step": 388 + }, + { + "clip_ratio/high_max": 4.2527130972302984e-06, + "clip_ratio/high_mean": 1.7856882550404407e-06, + "clip_ratio/low_mean": 2.875013205994037e-05, + "clip_ratio/low_min": 3.824852228717646e-06, + "clip_ratio/region_mean": 3.053582031498081e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15897.0, + "completions/mean_length": 7229.875, + "completions/mean_terminated_length": 6779.671875, + "completions/min_length": 579.0, + "completions/min_terminated_length": 579.0, + "entropy": 0.9420096501708031, + "epoch": 0.3578656853725851, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001755179837346077, + 
"learning_rate": 1e-05, + "loss": 0.075, + "num_tokens": 323782333.0, + "reward": 0.3984375, + "reward_std": 0.24541424214839935, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999120831489563, + "sampling/importance_sampling_ratio/min": 7.437798922182992e-05, + "sampling/sampling_logp_difference/max": 9.50635051727295, + "sampling/sampling_logp_difference/mean": 0.02008935809135437, + "step": 389 + }, + { + "clip_ratio/high_max": 8.81059531820938e-06, + "clip_ratio/high_mean": 2.202648829552345e-06, + "clip_ratio/low_mean": 2.0493020770118164e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.269566959967051e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16016.0, + "completions/mean_length": 6628.8359375, + "completions/mean_terminated_length": 6473.99267578125, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 1.0327190533280373, + "epoch": 0.35878564857405704, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.00202886201441288, + "learning_rate": 1e-05, + "loss": 0.0549, + "num_tokens": 324648848.0, + "reward": 0.421875, + "reward_std": 0.15650184452533722, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999722242355347, + "sampling/importance_sampling_ratio/min": 0.028374243527650833, + "sampling/sampling_logp_difference/max": 3.5622735023498535, + "sampling/sampling_logp_difference/mean": 0.021120186895132065, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.5018343005031056e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.5018343005031056e-05, + 
"completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15405.0, + "completions/mean_length": 7402.4140625, + "completions/mean_terminated_length": 7259.849609375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "entropy": 1.0335597470402718, + "epoch": 0.35970561177552896, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0026126320008188486, + "learning_rate": 1e-05, + "loss": 0.0271, + "num_tokens": 325617965.0, + "reward": 0.328125, + "reward_std": 0.21436560153961182, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000274181365967, + "sampling/importance_sampling_ratio/min": 0.002047094516456127, + "sampling/sampling_logp_difference/max": 6.191333770751953, + "sampling/sampling_logp_difference/mean": 0.021358007565140724, + "step": 391 + }, + { + "clip_ratio/high_max": 1.7713674878905294e-05, + "clip_ratio/high_mean": 5.139017389410583e-06, + "clip_ratio/low_mean": 4.4972417526878417e-05, + "clip_ratio/low_min": 8.263916242867708e-06, + "clip_ratio/region_mean": 5.0111435712096863e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16246.0, + "completions/mean_length": 7414.046875, + "completions/mean_terminated_length": 7124.693359375, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "entropy": 1.043906107544899, + "epoch": 0.36062557497700093, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004497586749494076, + "learning_rate": 1e-05, + "loss": 0.0558, + "num_tokens": 326583819.0, + "reward": 0.4140625, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9999969005584717, + "sampling/importance_sampling_ratio/min": 0.0015032986411824822, + "sampling/sampling_logp_difference/max": 6.500093460083008, + "sampling/sampling_logp_difference/mean": 0.021614551544189453, + "step": 392 + }, + { + "clip_ratio/high_max": 2.2412414182326756e-05, + "clip_ratio/high_mean": 5.603103545581689e-06, + "clip_ratio/low_mean": 2.0601042535872693e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.620414619514122e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14288.0, + "completions/max_terminated_length": 14288.0, + "completions/mean_length": 7090.5, + "completions/mean_terminated_length": 7090.5, + "completions/min_length": 1183.0, + "completions/min_terminated_length": 1183.0, + "entropy": 0.9755794927477837, + "epoch": 0.36154553817847285, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026554155629128218, + "learning_rate": 1e-05, + "loss": 0.0468, + "num_tokens": 327512315.0, + "reward": 0.53125, + "reward_std": 0.27722427248954773, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999885618686676, + "sampling/importance_sampling_ratio/min": 7.104578980943188e-05, + "sampling/sampling_logp_difference/max": 9.552186012268066, + "sampling/sampling_logp_difference/mean": 0.020926889032125473, + "step": 393 + }, + { + "clip_ratio/high_max": 3.259367531427415e-06, + "clip_ratio/high_mean": 1.5600960523443064e-06, + "clip_ratio/low_mean": 3.035687961983058e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.191697578586172e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15710.0, + "completions/mean_length": 7200.140625, + "completions/mean_terminated_length": 7127.82666015625, + "completions/min_length": 80.0, + 
"completions/min_terminated_length": 80.0, + "entropy": 0.9084664657711983, + "epoch": 0.3624655013799448, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018455780809745193, + "learning_rate": 1e-05, + "loss": -0.0024, + "num_tokens": 328454269.0, + "reward": 0.3828125, + "reward_std": 0.2301519364118576, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999072551727295, + "sampling/importance_sampling_ratio/min": 0.00033894419902935624, + "sampling/sampling_logp_difference/max": 7.989675045013428, + "sampling/sampling_logp_difference/mean": 0.01939154416322708, + "step": 394 + }, + { + "clip_ratio/high_max": 1.0260662747896276e-05, + "clip_ratio/high_mean": 2.565165686974069e-06, + "clip_ratio/low_mean": 3.0616293088314706e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.318145900266245e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 6977.5234375, + "completions/mean_terminated_length": 6674.08837890625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.9352559298276901, + "epoch": 0.36338546458141674, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021165197249501944, + "learning_rate": 1e-05, + "loss": 0.06, + "num_tokens": 329366400.0, + "reward": 0.4453125, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000401735305786, + "sampling/importance_sampling_ratio/min": 0.034073151648044586, + "sampling/sampling_logp_difference/max": 3.3792455196380615, + "sampling/sampling_logp_difference/mean": 0.020020857453346252, + "step": 395 + }, + { + "clip_ratio/high_max": 
2.777207805593207e-05, + "clip_ratio/high_mean": 6.9430195139830175e-06, + "clip_ratio/low_mean": 4.1006693436429487e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.794971300725592e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16230.0, + "completions/mean_length": 7819.828125, + "completions/mean_terminated_length": 7398.63916015625, + "completions/min_length": 1273.0, + "completions/min_terminated_length": 1273.0, + "entropy": 1.0045175030827522, + "epoch": 0.36430542778288866, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022391832899302244, + "learning_rate": 1e-05, + "loss": 0.1424, + "num_tokens": 330386442.0, + "reward": 0.453125, + "reward_std": 0.29302334785461426, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999501705169678, + "sampling/importance_sampling_ratio/min": 0.0002908352471422404, + "sampling/sampling_logp_difference/max": 8.142753601074219, + "sampling/sampling_logp_difference/mean": 0.021083837375044823, + "step": 396 + }, + { + "clip_ratio/high_max": 8.042205081437714e-06, + "clip_ratio/high_mean": 2.0105512703594286e-06, + "clip_ratio/low_mean": 3.623322004386864e-05, + "clip_ratio/low_min": 5.5314631026703864e-06, + "clip_ratio/region_mean": 3.8243771086854395e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16135.0, + "completions/mean_length": 6872.0859375, + "completions/mean_terminated_length": 6485.42236328125, + "completions/min_length": 609.0, + "completions/min_terminated_length": 609.0, + "entropy": 0.8501477539539337, + "epoch": 0.36522539098436063, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002260354580357671, + "learning_rate": 1e-05, + "loss": 0.0503, + "num_tokens": 331286181.0, + "reward": 0.4921875, + 
"reward_std": 0.2280302792787552, + "rewards/accuracy_reward/mean": 0.4921875, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999260902404785, + "sampling/importance_sampling_ratio/min": 0.0002785924880299717, + "sampling/sampling_logp_difference/max": 8.185760498046875, + "sampling/sampling_logp_difference/mean": 0.019428331404924393, + "step": 397 + }, + { + "clip_ratio/high_max": 3.206032488378696e-06, + "clip_ratio/high_mean": 8.01508122094674e-07, + "clip_ratio/low_mean": 2.8814496317863814e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9616004439958488e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16187.0, + "completions/mean_length": 6724.546875, + "completions/mean_terminated_length": 6571.22265625, + "completions/min_length": 588.0, + "completions/min_terminated_length": 588.0, + "entropy": 1.0110125690698624, + "epoch": 0.36614535418583255, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001649077981710434, + "learning_rate": 1e-05, + "loss": 0.0525, + "num_tokens": 332166003.0, + "reward": 0.421875, + "reward_std": 0.21542152762413025, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999189376831055, + "sampling/importance_sampling_ratio/min": 3.7501690712815616e-06, + "sampling/sampling_logp_difference/max": 12.493709564208984, + "sampling/sampling_logp_difference/mean": 0.020595930516719818, + "step": 398 + }, + { + "clip_ratio/high_max": 1.11491995085089e-05, + "clip_ratio/high_mean": 2.787299877127225e-06, + "clip_ratio/low_mean": 3.4109823332073574e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.689712332288764e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16096.0, + "completions/mean_length": 7833.546875, + "completions/mean_terminated_length": 7485.96728515625, + "completions/min_length": 1509.0, + "completions/min_terminated_length": 1509.0, + "entropy": 0.8942571505904198, + "epoch": 0.3670653173873045, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0010421582264825702, + "learning_rate": 1e-05, + "loss": 0.084, + "num_tokens": 333188785.0, + "reward": 0.328125, + "reward_std": 0.22567126154899597, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999600648880005, + "sampling/importance_sampling_ratio/min": 0.0008163535967469215, + "sampling/sampling_logp_difference/max": 7.110662937164307, + "sampling/sampling_logp_difference/mean": 0.018777694553136826, + "step": 399 + }, + { + "clip_ratio/high_max": 1.0101967518494348e-05, + "clip_ratio/high_mean": 2.525491879623587e-06, + "clip_ratio/low_mean": 3.350823226355715e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.603372420002415e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15624.0, + "completions/max_terminated_length": 15624.0, + "completions/mean_length": 7194.96875, + "completions/mean_terminated_length": 7194.96875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 1.0446517765522003, + "epoch": 0.36798528058877644, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002221160801127553, + "learning_rate": 1e-05, + "loss": 0.0284, + "num_tokens": 334128989.0, + "reward": 0.3671875, + "reward_std": 0.26826781034469604, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954879283905, + "sampling/importance_sampling_ratio/min": 8.05134459369583e-06, + 
"sampling/sampling_logp_difference/max": 11.729671478271484, + "sampling/sampling_logp_difference/mean": 0.021122492849826813, + "step": 400 + }, + { + "clip_ratio/high_max": 5.990032605041051e-06, + "clip_ratio/high_mean": 1.4975081512602628e-06, + "clip_ratio/low_mean": 2.5873220806715835e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.737072884428926e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14241.0, + "completions/mean_length": 7037.875, + "completions/mean_terminated_length": 6657.951171875, + "completions/min_length": 810.0, + "completions/min_terminated_length": 810.0, + "entropy": 0.9549769386649132, + "epoch": 0.3689052437902484, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0030101474840193987, + "learning_rate": 1e-05, + "loss": 0.0467, + "num_tokens": 335047917.0, + "reward": 0.4375, + "reward_std": 0.2398776262998581, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999676942825317, + "sampling/importance_sampling_ratio/min": 2.435619171592407e-05, + "sampling/sampling_logp_difference/max": 10.622724533081055, + "sampling/sampling_logp_difference/mean": 0.02049148827791214, + "step": 401 + }, + { + "clip_ratio/high_max": 8.082625754468609e-06, + "clip_ratio/high_mean": 2.020656438617152e-06, + "clip_ratio/low_mean": 3.1645918625144986e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.366657551850949e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16100.0, + "completions/mean_length": 7596.7890625, + "completions/mean_terminated_length": 7313.33056640625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "entropy": 0.8307650238275528, + "epoch": 0.36982520699172033, + "frac_reward_zero_std": 
0.4375, + "grad_norm": 0.0016104152891784906, + "learning_rate": 1e-05, + "loss": 0.0627, + "num_tokens": 336042178.0, + "reward": 0.359375, + "reward_std": 0.27722427248954773, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999981164932251, + "sampling/importance_sampling_ratio/min": 0.007673877757042646, + "sampling/sampling_logp_difference/max": 4.869933128356934, + "sampling/sampling_logp_difference/mean": 0.019274067133665085, + "step": 402 + }, + { + "clip_ratio/high_max": 5.6481858337065205e-06, + "clip_ratio/high_mean": 1.4120464584266301e-06, + "clip_ratio/low_mean": 1.32123756202418e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.4624422078668431e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16134.0, + "completions/mean_length": 7060.34375, + "completions/mean_terminated_length": 6836.576171875, + "completions/min_length": 897.0, + "completions/min_terminated_length": 897.0, + "entropy": 1.0481776595115662, + "epoch": 0.37074517019319225, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0010739013087004423, + "learning_rate": 1e-05, + "loss": 0.0452, + "num_tokens": 336963318.0, + "reward": 0.328125, + "reward_std": 0.1733490228652954, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000027060508728, + "sampling/importance_sampling_ratio/min": 0.00011510718468343839, + "sampling/sampling_logp_difference/max": 9.069646835327148, + "sampling/sampling_logp_difference/mean": 0.02168721705675125, + "step": 403 + }, + { + "clip_ratio/high_max": 3.200204901077086e-06, + "clip_ratio/high_mean": 8.000512252692715e-07, + "clip_ratio/low_mean": 1.9099150676993304e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.9899201902262575e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16357.0, + "completions/mean_length": 7746.484375, + "completions/mean_terminated_length": 7609.38134765625, + "completions/min_length": 960.0, + "completions/min_terminated_length": 960.0, + "entropy": 1.0216905921697617, + "epoch": 0.3716651333946642, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0016449482645839453, + "learning_rate": 1e-05, + "loss": 0.0255, + "num_tokens": 337972068.0, + "reward": 0.2421875, + "reward_std": 0.1830747127532959, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 0.0006486645434051752, + "sampling/sampling_logp_difference/max": 7.34059476852417, + "sampling/sampling_logp_difference/mean": 0.021722178906202316, + "step": 404 + }, + { + "clip_ratio/high_max": 2.161643442377681e-05, + "clip_ratio/high_mean": 5.404108605944202e-06, + "clip_ratio/low_mean": 4.580058657666086e-05, + "clip_ratio/low_min": 4.674994215747574e-06, + "clip_ratio/region_mean": 5.120469540997874e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15784.0, + "completions/mean_length": 6913.3984375, + "completions/mean_terminated_length": 6686.1044921875, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "entropy": 0.9993953481316566, + "epoch": 0.37258509659613614, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003412841120734811, + "learning_rate": 1e-05, + "loss": 0.0358, + "num_tokens": 338876663.0, + "reward": 0.46875, + "reward_std": 0.33797895908355713, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5009832978248596, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999386668205261, + "sampling/importance_sampling_ratio/min": 0.00012468472414184362, + "sampling/sampling_logp_difference/max": 8.98972225189209, + "sampling/sampling_logp_difference/mean": 0.02173588052392006, + "step": 405 + }, + { + "clip_ratio/high_max": 1.074430110747926e-05, + "clip_ratio/high_mean": 3.5224193766225653e-06, + "clip_ratio/low_mean": 2.64205210100954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9942940273031127e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16192.0, + "completions/mean_length": 7588.6953125, + "completions/mean_terminated_length": 7377.6083984375, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "entropy": 1.1119055226445198, + "epoch": 0.3735050597976081, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0015972270630300045, + "learning_rate": 1e-05, + "loss": -0.0047, + "num_tokens": 339871184.0, + "reward": 0.28125, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999430775642395, + "sampling/importance_sampling_ratio/min": 0.00015846571477595717, + "sampling/sampling_logp_difference/max": 8.749972343444824, + "sampling/sampling_logp_difference/mean": 0.022462764754891396, + "step": 406 + }, + { + "clip_ratio/high_max": 1.2445105085134855e-05, + "clip_ratio/high_mean": 3.111276271283714e-06, + "clip_ratio/low_mean": 4.525409747202502e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.836537357277848e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16331.0, + "completions/max_terminated_length": 16331.0, + "completions/mean_length": 6522.4453125, + "completions/mean_terminated_length": 
6522.4453125, + "completions/min_length": 872.0, + "completions/min_terminated_length": 872.0, + "entropy": 1.0155515000224113, + "epoch": 0.37442502299908004, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002284019021317363, + "learning_rate": 1e-05, + "loss": 0.0607, + "num_tokens": 340725769.0, + "reward": 0.515625, + "reward_std": 0.28749164938926697, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998612999916077, + "sampling/importance_sampling_ratio/min": 0.0008916885708458722, + "sampling/sampling_logp_difference/max": 7.022393703460693, + "sampling/sampling_logp_difference/mean": 0.02157575450837612, + "step": 407 + }, + { + "clip_ratio/high_max": 1.4456319377131877e-05, + "clip_ratio/high_mean": 3.614079844282969e-06, + "clip_ratio/low_mean": 2.7839718427458138e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1453798442271363e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13961.0, + "completions/mean_length": 6749.125, + "completions/mean_terminated_length": 6517.88818359375, + "completions/min_length": 1156.0, + "completions/min_terminated_length": 1156.0, + "entropy": 1.0721680670976639, + "epoch": 0.37534498620055196, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0010391590185463428, + "learning_rate": 1e-05, + "loss": 0.0622, + "num_tokens": 341610881.0, + "reward": 0.3828125, + "reward_std": 0.1990984082221985, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999426007270813, + "sampling/importance_sampling_ratio/min": 0.00020901163225062191, + "sampling/sampling_logp_difference/max": 8.47312068939209, + "sampling/sampling_logp_difference/mean": 
0.02200891077518463, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.4307706237559614e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.4307706237559614e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16374.0, + "completions/mean_length": 7988.4375, + "completions/mean_terminated_length": 7647.154296875, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "entropy": 0.9933496564626694, + "epoch": 0.37626494940202393, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022965834941715, + "learning_rate": 1e-05, + "loss": 0.0168, + "num_tokens": 342652897.0, + "reward": 0.328125, + "reward_std": 0.2459382861852646, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.0003876982373185456, + "sampling/sampling_logp_difference/max": 7.855283260345459, + "sampling/sampling_logp_difference/mean": 0.020454837009310722, + "step": 409 + }, + { + "clip_ratio/high_max": 6.58229714645131e-06, + "clip_ratio/high_mean": 1.6455742866128276e-06, + "clip_ratio/low_mean": 3.983285796493874e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.14784317399608e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15968.0, + "completions/mean_length": 7072.4140625, + "completions/mean_terminated_length": 6848.9365234375, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "entropy": 0.9560660421848297, + "epoch": 0.37718491260349585, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0027879721019417048, + "learning_rate": 1e-05, + "loss": 0.0501, + "num_tokens": 343578670.0, + "reward": 0.5234375, + 
"reward_std": 0.3043339252471924, + "rewards/accuracy_reward/mean": 0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000234842300415, + "sampling/importance_sampling_ratio/min": 0.0001181035113404505, + "sampling/sampling_logp_difference/max": 9.043949127197266, + "sampling/sampling_logp_difference/mean": 0.021169768646359444, + "step": 410 + }, + { + "clip_ratio/high_max": 1.9136705304845236e-05, + "clip_ratio/high_mean": 4.784176326211309e-06, + "clip_ratio/low_mean": 2.449715702823596e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.928133335444727e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15768.0, + "completions/mean_length": 6586.515625, + "completions/mean_terminated_length": 6270.4677734375, + "completions/min_length": 613.0, + "completions/min_terminated_length": 613.0, + "entropy": 0.893077902495861, + "epoch": 0.3781048758049678, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016929456032812595, + "learning_rate": 1e-05, + "loss": 0.0457, + "num_tokens": 344441080.0, + "reward": 0.4765625, + "reward_std": 0.20175683498382568, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999939799308777, + "sampling/importance_sampling_ratio/min": 0.013895876705646515, + "sampling/sampling_logp_difference/max": 4.276163101196289, + "sampling/sampling_logp_difference/mean": 0.019590143114328384, + "step": 411 + }, + { + "clip_ratio/high_max": 1.2621936093637487e-05, + "clip_ratio/high_mean": 3.1554840234093717e-06, + "clip_ratio/low_mean": 5.4418370382336434e-05, + "clip_ratio/low_min": 1.5258214943969506e-05, + "clip_ratio/region_mean": 5.7573854519432643e-05, + "completions/clipped_ratio": 0.03125, + 
"completions/max_length": 16384.0, + "completions/max_terminated_length": 15765.0, + "completions/mean_length": 7903.296875, + "completions/mean_terminated_length": 7629.7255859375, + "completions/min_length": 1820.0, + "completions/min_terminated_length": 1820.0, + "entropy": 0.943502850830555, + "epoch": 0.37902483900643974, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0024831818882375956, + "learning_rate": 1e-05, + "loss": 0.0638, + "num_tokens": 345472414.0, + "reward": 0.4765625, + "reward_std": 0.3243142366409302, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999592900276184, + "sampling/importance_sampling_ratio/min": 0.0009350833133794367, + "sampling/sampling_logp_difference/max": 6.974874973297119, + "sampling/sampling_logp_difference/mean": 0.020601853728294373, + "step": 412 + }, + { + "clip_ratio/high_max": 2.738965622484102e-05, + "clip_ratio/high_mean": 9.173523380923143e-06, + "clip_ratio/low_mean": 2.9159931841604703e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.8333455336214683e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15211.0, + "completions/mean_length": 7016.890625, + "completions/mean_terminated_length": 6943.1337890625, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 0.9670446068048477, + "epoch": 0.37994480220791166, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0032182165887206793, + "learning_rate": 1e-05, + "loss": 0.0724, + "num_tokens": 346388112.0, + "reward": 0.421875, + "reward_std": 0.3306122422218323, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + 
"sampling/importance_sampling_ratio/min": 0.000258272688370198, + "sampling/sampling_logp_difference/max": 8.261494636535645, + "sampling/sampling_logp_difference/mean": 0.020366424694657326, + "step": 413 + }, + { + "clip_ratio/high_max": 6.399099220288917e-06, + "clip_ratio/high_mean": 1.5997748050722294e-06, + "clip_ratio/low_mean": 2.1530643095957203e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3130417901029432e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16038.0, + "completions/mean_length": 7043.8046875, + "completions/mean_terminated_length": 6819.64013671875, + "completions/min_length": 1331.0, + "completions/min_terminated_length": 1331.0, + "entropy": 1.022966854274273, + "epoch": 0.38086476540938363, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0023274575360119343, + "learning_rate": 1e-05, + "loss": 0.0724, + "num_tokens": 347312071.0, + "reward": 0.3671875, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999589323997498, + "sampling/importance_sampling_ratio/min": 0.0013508512638509274, + "sampling/sampling_logp_difference/max": 6.607020378112793, + "sampling/sampling_logp_difference/mean": 0.021443769335746765, + "step": 414 + }, + { + "clip_ratio/high_max": 1.896051571748103e-05, + "clip_ratio/high_mean": 4.7401289293702575e-06, + "clip_ratio/low_mean": 2.3596727601216116e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.83368563032127e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14908.0, + "completions/mean_length": 6475.6484375, + "completions/mean_terminated_length": 6318.37353515625, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 
0.9873237758874893, + "epoch": 0.38178472861085555, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0008460046374239028, + "learning_rate": 1e-05, + "loss": 0.0639, + "num_tokens": 348161394.0, + "reward": 0.4375, + "reward_std": 0.22620806097984314, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999297261238098, + "sampling/importance_sampling_ratio/min": 0.012015017680823803, + "sampling/sampling_logp_difference/max": 4.421597957611084, + "sampling/sampling_logp_difference/mean": 0.019627809524536133, + "step": 415 + }, + { + "clip_ratio/high_max": 1.9873371229550685e-05, + "clip_ratio/high_mean": 4.968342807387671e-06, + "clip_ratio/low_mean": 4.485099543671822e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.981933852832299e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16243.0, + "completions/mean_length": 8038.90625, + "completions/mean_terminated_length": 7699.67431640625, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.9513615965843201, + "epoch": 0.3827046918123275, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017075197538360953, + "learning_rate": 1e-05, + "loss": 0.0758, + "num_tokens": 349211078.0, + "reward": 0.328125, + "reward_std": 0.30221715569496155, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000017523765564, + "sampling/importance_sampling_ratio/min": 0.0001345122145721689, + "sampling/sampling_logp_difference/max": 8.91385555267334, + "sampling/sampling_logp_difference/mean": 0.020795777440071106, + "step": 416 + }, + { + "clip_ratio/high_max": 3.976459538534982e-06, + "clip_ratio/high_mean": 
9.941148846337455e-07, + "clip_ratio/low_mean": 4.385826059660758e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.485237468543346e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16304.0, + "completions/mean_length": 7339.3046875, + "completions/mean_terminated_length": 7122.232421875, + "completions/min_length": 1002.0, + "completions/min_terminated_length": 1002.0, + "entropy": 0.9872350245714188, + "epoch": 0.38362465501379944, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0016014629509299994, + "learning_rate": 1e-05, + "loss": 0.0609, + "num_tokens": 350171613.0, + "reward": 0.4453125, + "reward_std": 0.2698703408241272, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999157190322876, + "sampling/importance_sampling_ratio/min": 1.3763129302901689e-08, + "sampling/sampling_logp_difference/max": 18.101272583007812, + "sampling/sampling_logp_difference/mean": 0.021187925711274147, + "step": 417 + }, + { + "clip_ratio/high_max": 9.294637948187301e-06, + "clip_ratio/high_mean": 2.3236594870468252e-06, + "clip_ratio/low_mean": 2.512099752038921e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7444657121122873e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15341.0, + "completions/mean_length": 7239.359375, + "completions/mean_terminated_length": 7094.20654296875, + "completions/min_length": 1294.0, + "completions/min_terminated_length": 1294.0, + "entropy": 0.9430425837635994, + "epoch": 0.3845446182152714, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025373264215886593, + "learning_rate": 1e-05, + "loss": 0.0038, + "num_tokens": 351116803.0, + "reward": 0.5234375, + "reward_std": 0.24671243131160736, + "rewards/accuracy_reward/mean": 
0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999785423278809, + "sampling/importance_sampling_ratio/min": 0.014291372150182724, + "sampling/sampling_logp_difference/max": 4.248099327087402, + "sampling/sampling_logp_difference/mean": 0.019912682473659515, + "step": 418 + }, + { + "clip_ratio/high_max": 1.5709408671682468e-05, + "clip_ratio/high_mean": 5.310340270625602e-06, + "clip_ratio/low_mean": 3.522799016764111e-05, + "clip_ratio/low_min": 6.063465662009548e-06, + "clip_ratio/region_mean": 4.053833055195355e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15833.0, + "completions/mean_length": 7211.7421875, + "completions/mean_terminated_length": 7066.1513671875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.841051459312439, + "epoch": 0.38546458141674333, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002612616401165724, + "learning_rate": 1e-05, + "loss": 0.1042, + "num_tokens": 352059034.0, + "reward": 0.625, + "reward_std": 0.327729195356369, + "rewards/accuracy_reward/mean": 0.625, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999889731407166, + "sampling/importance_sampling_ratio/min": 2.5700239802972646e-06, + "sampling/sampling_logp_difference/max": 12.87159538269043, + "sampling/sampling_logp_difference/mean": 0.01921844482421875, + "step": 419 + }, + { + "clip_ratio/high_max": 7.196444812507252e-06, + "clip_ratio/high_mean": 1.799111203126813e-06, + "clip_ratio/low_mean": 1.714175300548959e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.894086381071247e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14390.0, + 
"completions/mean_length": 6374.6953125, + "completions/mean_terminated_length": 6295.8818359375, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 1.0578313246369362, + "epoch": 0.38638454461821525, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0019967984408140182, + "learning_rate": 1e-05, + "loss": 0.0363, + "num_tokens": 352896219.0, + "reward": 0.359375, + "reward_std": 0.19438526034355164, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999984860420227, + "sampling/importance_sampling_ratio/min": 0.020119966939091682, + "sampling/sampling_logp_difference/max": 4.295470237731934, + "sampling/sampling_logp_difference/mean": 0.02013232931494713, + "step": 420 + }, + { + "clip_ratio/high_max": 3.095712781941984e-05, + "clip_ratio/high_mean": 7.73928195485496e-06, + "clip_ratio/low_mean": 4.0026389058311906e-05, + "clip_ratio/low_min": 8.968050451585441e-06, + "clip_ratio/region_mean": 4.7765669989985327e-05, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16313.0, + "completions/mean_length": 7274.7109375, + "completions/mean_terminated_length": 6667.42529296875, + "completions/min_length": 1191.0, + "completions/min_terminated_length": 1191.0, + "entropy": 0.7415856420993805, + "epoch": 0.3873045078196872, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0018279170617461205, + "learning_rate": 1e-05, + "loss": 0.067, + "num_tokens": 353844990.0, + "reward": 0.53125, + "reward_std": 0.29696235060691833, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998984336853027, + "sampling/importance_sampling_ratio/min": 0.00019450874242465943, + 
"sampling/sampling_logp_difference/max": 8.54503345489502, + "sampling/sampling_logp_difference/mean": 0.017373956739902496, + "step": 421 + }, + { + "clip_ratio/high_max": 1.3592496998171555e-05, + "clip_ratio/high_mean": 3.3981242495428887e-06, + "clip_ratio/low_mean": 4.277909783922951e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.6177221065590857e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15676.0, + "completions/mean_length": 7902.9296875, + "completions/mean_terminated_length": 7836.1494140625, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "entropy": 1.0019611343741417, + "epoch": 0.38822447102115915, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001777544734068215, + "learning_rate": 1e-05, + "loss": 0.0534, + "num_tokens": 354873933.0, + "reward": 0.3125, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999098777770996, + "sampling/importance_sampling_ratio/min": 0.001003989833407104, + "sampling/sampling_logp_difference/max": 6.903773307800293, + "sampling/sampling_logp_difference/mean": 0.021197015419602394, + "step": 422 + }, + { + "clip_ratio/high_max": 2.524704336792638e-05, + "clip_ratio/high_mean": 7.122522617919458e-06, + "clip_ratio/low_mean": 2.635721989463491e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.347974279677146e-05, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14700.0, + "completions/mean_length": 7304.046875, + "completions/mean_terminated_length": 6617.328125, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.8584602400660515, + "epoch": 0.3891444342226311, + "frac_reward_zero_std": 0.375, 
+ "grad_norm": 0.00153827341273427, + "learning_rate": 1e-05, + "loss": 0.0813, + "num_tokens": 355829507.0, + "reward": 0.3671875, + "reward_std": 0.2982654273509979, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999763369560242, + "sampling/importance_sampling_ratio/min": 3.820072379312478e-05, + "sampling/sampling_logp_difference/max": 10.172656059265137, + "sampling/sampling_logp_difference/mean": 0.019642215222120285, + "step": 423 + }, + { + "clip_ratio/high_max": 5.025731752539286e-06, + "clip_ratio/high_mean": 1.2564329381348216e-06, + "clip_ratio/low_mean": 3.204466929673799e-05, + "clip_ratio/low_min": 3.388819550309563e-06, + "clip_ratio/region_mean": 3.330110212118598e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15074.0, + "completions/mean_length": 5677.21875, + "completions/mean_terminated_length": 5507.27001953125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 1.0159753635525703, + "epoch": 0.39006439742410304, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002107275417074561, + "learning_rate": 1e-05, + "loss": 0.0311, + "num_tokens": 356573231.0, + "reward": 0.5, + "reward_std": 0.25354719161987305, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999579191207886, + "sampling/importance_sampling_ratio/min": 0.0019436449510976672, + "sampling/sampling_logp_difference/max": 6.243190288543701, + "sampling/sampling_logp_difference/mean": 0.020722679793834686, + "step": 424 + }, + { + "clip_ratio/high_max": 1.4743651718163164e-05, + "clip_ratio/high_mean": 3.685912929540791e-06, + "clip_ratio/low_mean": 1.6582721229951858e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0268634500553162e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15407.0, + "completions/max_terminated_length": 15407.0, + "completions/mean_length": 6209.078125, + "completions/mean_terminated_length": 6209.078125, + "completions/min_length": 723.0, + "completions/min_terminated_length": 723.0, + "entropy": 0.8867508247494698, + "epoch": 0.39098436062557496, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001720887958072126, + "learning_rate": 1e-05, + "loss": 0.0828, + "num_tokens": 357387169.0, + "reward": 0.5703125, + "reward_std": 0.2409384548664093, + "rewards/accuracy_reward/mean": 0.5703125, + "rewards/accuracy_reward/std": 0.4969765841960907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000000238418579, + "sampling/importance_sampling_ratio/min": 4.222915777063463e-06, + "sampling/sampling_logp_difference/max": 12.374984741210938, + "sampling/sampling_logp_difference/mean": 0.017990771681070328, + "step": 425 + }, + { + "clip_ratio/high_max": 7.870049557823222e-06, + "clip_ratio/high_mean": 1.9675123894558055e-06, + "clip_ratio/low_mean": 1.6993449889923795e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.89609622793796e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15845.0, + "completions/mean_length": 7183.3671875, + "completions/mean_terminated_length": 6962.55224609375, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.9918289259076118, + "epoch": 0.39190432382704693, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0012448625639081001, + "learning_rate": 1e-05, + "loss": 0.0853, + "num_tokens": 358334584.0, + "reward": 0.328125, + "reward_std": 0.17464719712734222, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999426007270813, + "sampling/importance_sampling_ratio/min": 0.00038028976996429265, + "sampling/sampling_logp_difference/max": 7.874577045440674, + "sampling/sampling_logp_difference/mean": 0.020646382123231888, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.9313079608073167e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9313079608073167e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15106.0, + "completions/mean_length": 6877.1484375, + "completions/mean_terminated_length": 6802.29150390625, + "completions/min_length": 2027.0, + "completions/min_terminated_length": 2027.0, + "entropy": 0.8806835636496544, + "epoch": 0.39282428702851885, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.001519464422017336, + "learning_rate": 1e-05, + "loss": 0.0686, + "num_tokens": 359233451.0, + "reward": 0.375, + "reward_std": 0.19438527524471283, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998905658721924, + "sampling/importance_sampling_ratio/min": 0.008662254549562931, + "sampling/sampling_logp_difference/max": 4.748780250549316, + "sampling/sampling_logp_difference/mean": 0.01951739378273487, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.164141705587099e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.164141705587099e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16061.0, + "completions/max_terminated_length": 16061.0, + "completions/mean_length": 6964.6875, + "completions/mean_terminated_length": 6964.6875, + "completions/min_length": 1148.0, + "completions/min_terminated_length": 
1148.0, + "entropy": 0.8069597631692886, + "epoch": 0.3937442502299908, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022954042069613934, + "learning_rate": 1e-05, + "loss": 0.1217, + "num_tokens": 360143003.0, + "reward": 0.53125, + "reward_std": 0.3253750801086426, + "rewards/accuracy_reward/mean": 0.53125, + "rewards/accuracy_reward/std": 0.5009832978248596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999755620956421, + "sampling/importance_sampling_ratio/min": 0.00020347768440842628, + "sampling/sampling_logp_difference/max": 8.499954223632812, + "sampling/sampling_logp_difference/mean": 0.01880607008934021, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 1.7779158497432945e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.7779158497432945e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16123.0, + "completions/mean_length": 7322.0, + "completions/mean_terminated_length": 7178.1591796875, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "entropy": 1.0852478593587875, + "epoch": 0.39466421343146274, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0021376016084104776, + "learning_rate": 1e-05, + "loss": 0.0094, + "num_tokens": 361101379.0, + "reward": 0.3046875, + "reward_std": 0.15308690071105957, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000006079673767, + "sampling/importance_sampling_ratio/min": 0.00011516757513163611, + "sampling/sampling_logp_difference/max": 9.069122314453125, + "sampling/sampling_logp_difference/mean": 0.021568164229393005, + "step": 429 + }, + { + "clip_ratio/high_max": 3.1260904052032856e-05, + "clip_ratio/high_mean": 8.905177651286067e-06, + 
"clip_ratio/low_mean": 4.4633561628870666e-05, + "clip_ratio/low_min": 4.338168764661532e-06, + "clip_ratio/region_mean": 5.353873848434887e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15880.0, + "completions/mean_length": 7027.0078125, + "completions/mean_terminated_length": 6646.64208984375, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "entropy": 0.8932972475886345, + "epoch": 0.39558417663293466, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0031003563199192286, + "learning_rate": 1e-05, + "loss": 0.0875, + "num_tokens": 362018284.0, + "reward": 0.5, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.5, + "rewards/accuracy_reward/std": 0.5019646286964417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999800324440002, + "sampling/importance_sampling_ratio/min": 0.0010351726086810231, + "sampling/sampling_logp_difference/max": 6.873187065124512, + "sampling/sampling_logp_difference/mean": 0.020102323964238167, + "step": 430 + }, + { + "clip_ratio/high_max": 1.5146189525694354e-05, + "clip_ratio/high_mean": 4.871089572588971e-06, + "clip_ratio/low_mean": 4.263560651907028e-05, + "clip_ratio/low_min": 1.8708525658439612e-05, + "clip_ratio/region_mean": 4.7506695409538224e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15430.0, + "completions/mean_length": 6341.7421875, + "completions/mean_terminated_length": 6262.66943359375, + "completions/min_length": 939.0, + "completions/min_terminated_length": 939.0, + "entropy": 0.885854922235012, + "epoch": 0.39650413983440663, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018177316524088383, + "learning_rate": 1e-05, + "loss": 0.0701, + "num_tokens": 362851107.0, + "reward": 0.5234375, + "reward_std": 0.28171277046203613, + "rewards/accuracy_reward/mean": 
0.5234375, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999906063079834, + "sampling/importance_sampling_ratio/min": 0.0005522234132513404, + "sampling/sampling_logp_difference/max": 7.50155782699585, + "sampling/sampling_logp_difference/mean": 0.020463842898607254, + "step": 431 + }, + { + "clip_ratio/high_max": 1.9989562133559957e-05, + "clip_ratio/high_mean": 5.9246351611363934e-06, + "clip_ratio/low_mean": 3.242748857701372e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.835212419289746e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12672.0, + "completions/mean_length": 6388.875, + "completions/mean_terminated_length": 6310.17333984375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.9593783840537071, + "epoch": 0.39742410303587855, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001520519028417766, + "learning_rate": 1e-05, + "loss": 0.0503, + "num_tokens": 363691019.0, + "reward": 0.328125, + "reward_std": 0.2972046136856079, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000056028366089, + "sampling/importance_sampling_ratio/min": 0.0011127673787996173, + "sampling/sampling_logp_difference/max": 6.800905227661133, + "sampling/sampling_logp_difference/mean": 0.019675832241773605, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.4561562668168335e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4561562668168335e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15240.0, + "completions/mean_length": 7559.875, + 
"completions/mean_terminated_length": 7125.9013671875, + "completions/min_length": 1292.0, + "completions/min_terminated_length": 1292.0, + "entropy": 0.8298296853899956, + "epoch": 0.3983440662373505, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016424815403297544, + "learning_rate": 1e-05, + "loss": 0.026, + "num_tokens": 364679475.0, + "reward": 0.4765625, + "reward_std": 0.2409384697675705, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000078678131104, + "sampling/importance_sampling_ratio/min": 0.00026268011424690485, + "sampling/sampling_logp_difference/max": 8.244573593139648, + "sampling/sampling_logp_difference/mean": 0.01943236216902733, + "step": 433 + }, + { + "clip_ratio/high_max": 9.62110971158836e-06, + "clip_ratio/high_mean": 2.40527742789709e-06, + "clip_ratio/low_mean": 3.785217859331169e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.025745568014827e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16162.0, + "completions/mean_length": 5993.1328125, + "completions/mean_terminated_length": 5743.75244140625, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.9428447484970093, + "epoch": 0.39926402943882244, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0020216500852257013, + "learning_rate": 1e-05, + "loss": 0.0383, + "num_tokens": 365464588.0, + "reward": 0.5078125, + "reward_std": 0.28353995084762573, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999315142631531, + "sampling/importance_sampling_ratio/min": 0.0006411138456314802, + "sampling/sampling_logp_difference/max": 7.352303504943848, + 
"sampling/sampling_logp_difference/mean": 0.0196966715157032, + "step": 434 + }, + { + "clip_ratio/high_max": 1.3527967894333415e-05, + "clip_ratio/high_mean": 3.3819919735833537e-06, + "clip_ratio/low_mean": 2.5303937945864163e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8685930146821192e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15118.0, + "completions/mean_length": 5325.0390625, + "completions/mean_terminated_length": 5149.50048828125, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "entropy": 0.7730643972754478, + "epoch": 0.40018399264029436, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0017544744769111276, + "learning_rate": 1e-05, + "loss": 0.0816, + "num_tokens": 366167481.0, + "reward": 0.671875, + "reward_std": 0.30091896653175354, + "rewards/accuracy_reward/mean": 0.671875, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000058889389038, + "sampling/importance_sampling_ratio/min": 0.0008072834461927414, + "sampling/sampling_logp_difference/max": 7.121835708618164, + "sampling/sampling_logp_difference/mean": 0.01736798696219921, + "step": 435 + }, + { + "clip_ratio/high_max": 8.82370454746706e-06, + "clip_ratio/high_mean": 3.1566120810566645e-06, + "clip_ratio/low_mean": 2.7905126785299217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1061739150572976e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15321.0, + "completions/mean_length": 7331.3359375, + "completions/mean_terminated_length": 7114.072265625, + "completions/min_length": 1160.0, + "completions/min_terminated_length": 1160.0, + "entropy": 0.9418040588498116, + "epoch": 0.40110395584176634, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0013123912503942847, + 
"learning_rate": 1e-05, + "loss": 0.0445, + "num_tokens": 367126948.0, + "reward": 0.453125, + "reward_std": 0.3243093490600586, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999157786369324, + "sampling/importance_sampling_ratio/min": 0.0023285788483917713, + "sampling/sampling_logp_difference/max": 6.062497138977051, + "sampling/sampling_logp_difference/mean": 0.020918458700180054, + "step": 436 + }, + { + "clip_ratio/high_max": 1.6637134194752434e-05, + "clip_ratio/high_mean": 4.1592835486881086e-06, + "clip_ratio/low_mean": 4.105965246026244e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.521893566789004e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16209.0, + "completions/mean_length": 7217.25, + "completions/mean_terminated_length": 6686.94189453125, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.9499563127756119, + "epoch": 0.40202391904323825, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0021181178744882345, + "learning_rate": 1e-05, + "loss": 0.0656, + "num_tokens": 368071772.0, + "reward": 0.453125, + "reward_std": 0.3593195080757141, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.4997538626194, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998823404312134, + "sampling/importance_sampling_ratio/min": 0.000675773830153048, + "sampling/sampling_logp_difference/max": 7.299652099609375, + "sampling/sampling_logp_difference/mean": 0.020650038495659828, + "step": 437 + }, + { + "clip_ratio/high_max": 1.2043050901411334e-05, + "clip_ratio/high_mean": 3.0107627253528335e-06, + "clip_ratio/low_mean": 3.4911336570075946e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 
3.792209963648929e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15669.0, + "completions/mean_length": 7549.796875, + "completions/mean_terminated_length": 7264.822265625, + "completions/min_length": 1037.0, + "completions/min_terminated_length": 1037.0, + "entropy": 1.0309365764260292, + "epoch": 0.4029438822447102, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0019147706916555762, + "learning_rate": 1e-05, + "loss": 0.0159, + "num_tokens": 369055650.0, + "reward": 0.359375, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999600648880005, + "sampling/importance_sampling_ratio/min": 0.0010221411939710379, + "sampling/sampling_logp_difference/max": 6.885855674743652, + "sampling/sampling_logp_difference/mean": 0.02183394506573677, + "step": 438 + }, + { + "clip_ratio/high_max": 3.9433421079593245e-06, + "clip_ratio/high_mean": 9.858355269898311e-07, + "clip_ratio/low_mean": 3.9529069113086734e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.051490452638973e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15269.0, + "completions/mean_length": 6727.53125, + "completions/mean_terminated_length": 6651.49609375, + "completions/min_length": 1157.0, + "completions/min_terminated_length": 1157.0, + "entropy": 0.9676288217306137, + "epoch": 0.40386384544618215, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0031485585495829582, + "learning_rate": 1e-05, + "loss": 0.095, + "num_tokens": 369938574.0, + "reward": 0.4765625, + "reward_std": 0.3306073546409607, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 1.0000512599945068, + "sampling/importance_sampling_ratio/min": 0.000710509717464447, + "sampling/sampling_logp_difference/max": 7.249527931213379, + "sampling/sampling_logp_difference/mean": 0.020127974450588226, + "step": 439 + }, + { + "clip_ratio/high_max": 1.0043262818726362e-05, + "clip_ratio/high_mean": 2.5108157046815904e-06, + "clip_ratio/low_mean": 3.8503443363424594e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.101425872704567e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15864.0, + "completions/mean_length": 6281.7265625, + "completions/mean_terminated_length": 5955.8466796875, + "completions/min_length": 672.0, + "completions/min_terminated_length": 672.0, + "entropy": 0.9817835092544556, + "epoch": 0.4047838086476541, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.003415121464058757, + "learning_rate": 1e-05, + "loss": 0.0332, + "num_tokens": 370760459.0, + "reward": 0.375, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.4860251843929291, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999826550483704, + "sampling/importance_sampling_ratio/min": 4.502153956309485e-07, + "sampling/sampling_logp_difference/max": 14.613539695739746, + "sampling/sampling_logp_difference/mean": 0.02063862606883049, + "step": 440 + }, + { + "clip_ratio/high_max": 2.3593061087012757e-05, + "clip_ratio/high_mean": 7.003677183092805e-06, + "clip_ratio/low_mean": 1.8947657395074202e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5951335032914358e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16272.0, + "completions/mean_length": 6765.71875, + "completions/mean_terminated_length": 6689.984375, + "completions/min_length": 1444.0, + 
"completions/min_terminated_length": 1444.0, + "entropy": 1.0270514711737633, + "epoch": 0.40570377184912604, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.00214037811383605, + "learning_rate": 1e-05, + "loss": 0.0366, + "num_tokens": 371649103.0, + "reward": 0.4765625, + "reward_std": 0.1830746978521347, + "rewards/accuracy_reward/mean": 0.4765625, + "rewards/accuracy_reward/std": 0.5014128684997559, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999620318412781, + "sampling/importance_sampling_ratio/min": 0.001930466154590249, + "sampling/sampling_logp_difference/max": 6.249993801116943, + "sampling/sampling_logp_difference/mean": 0.02172943949699402, + "step": 441 + }, + { + "clip_ratio/high_max": 2.1009727788623422e-05, + "clip_ratio/high_mean": 6.259035217226483e-06, + "clip_ratio/low_mean": 5.011202529203729e-05, + "clip_ratio/low_min": 3.1568047234031837e-06, + "clip_ratio/region_mean": 5.637106050926377e-05, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16159.0, + "completions/mean_length": 7481.625, + "completions/mean_terminated_length": 6966.611328125, + "completions/min_length": 782.0, + "completions/min_terminated_length": 782.0, + "entropy": 0.9730701074004173, + "epoch": 0.40662373505059796, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.003510556183755398, + "learning_rate": 1e-05, + "loss": 0.0022, + "num_tokens": 372624535.0, + "reward": 0.4140625, + "reward_std": 0.3464162349700928, + "rewards/accuracy_reward/mean": 0.4140625, + "rewards/accuracy_reward/std": 0.49449479579925537, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492168426514, + "sampling/importance_sampling_ratio/min": 0.0003729084855876863, + "sampling/sampling_logp_difference/max": 7.894177436828613, + "sampling/sampling_logp_difference/mean": 0.02149931713938713, + "step": 442 + }, + { + 
"clip_ratio/high_max": 2.8992230909352656e-06, + "clip_ratio/high_mean": 7.248057727338164e-07, + "clip_ratio/low_mean": 3.781230475397024e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.853711018564354e-05, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15994.0, + "completions/mean_length": 8474.2109375, + "completions/mean_terminated_length": 8152.67431640625, + "completions/min_length": 983.0, + "completions/min_terminated_length": 983.0, + "entropy": 0.9761426225304604, + "epoch": 0.40754369825206993, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0018307552672922611, + "learning_rate": 1e-05, + "loss": 0.062, + "num_tokens": 373732962.0, + "reward": 0.328125, + "reward_std": 0.3214184641838074, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999135732650757, + "sampling/importance_sampling_ratio/min": 1.6381112288854638e-07, + "sampling/sampling_logp_difference/max": 15.624551773071289, + "sampling/sampling_logp_difference/mean": 0.02121492102742195, + "step": 443 + }, + { + "clip_ratio/high_max": 7.689794983889442e-06, + "clip_ratio/high_mean": 1.9224487459723605e-06, + "clip_ratio/low_mean": 4.332422963670979e-05, + "clip_ratio/low_min": 5.504910404852126e-06, + "clip_ratio/region_mean": 4.5246677473187447e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15117.0, + "completions/max_terminated_length": 15117.0, + "completions/mean_length": 7433.953125, + "completions/mean_terminated_length": 7433.953125, + "completions/min_length": 1108.0, + "completions/min_terminated_length": 1108.0, + "entropy": 1.0665365010499954, + "epoch": 0.40846366145354185, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021801323164254427, + "learning_rate": 1e-05, + "loss": -0.0046, + "num_tokens": 374706548.0, + "reward": 
0.421875, + "reward_std": 0.2590789198875427, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.4957992732524872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999595880508423, + "sampling/importance_sampling_ratio/min": 1.2762369294705422e-07, + "sampling/sampling_logp_difference/max": 15.87417984008789, + "sampling/sampling_logp_difference/mean": 0.022046178579330444, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.7811285235657124e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7811285235657124e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15989.0, + "completions/mean_length": 7646.7265625, + "completions/mean_terminated_length": 7217.0244140625, + "completions/min_length": 1019.0, + "completions/min_terminated_length": 1019.0, + "entropy": 0.9308071210980415, + "epoch": 0.4093836246550138, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0014674996491521597, + "learning_rate": 1e-05, + "loss": 0.0225, + "num_tokens": 375706673.0, + "reward": 0.328125, + "reward_std": 0.1820138692855835, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000466108322144, + "sampling/importance_sampling_ratio/min": 2.4991354621306527e-06, + "sampling/sampling_logp_difference/max": 12.899565696716309, + "sampling/sampling_logp_difference/mean": 0.018912145867943764, + "step": 445 + }, + { + "clip_ratio/high_max": 2.569714251876576e-05, + "clip_ratio/high_mean": 6.42428562969144e-06, + "clip_ratio/low_mean": 2.5548037910994026e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.1972323540685466e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 15082.0, + "completions/mean_length": 6046.46875, + "completions/mean_terminated_length": 5965.07080078125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.9040833190083504, + "epoch": 0.41030358785648574, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002320521976798773, + "learning_rate": 1e-05, + "loss": 0.0489, + "num_tokens": 376506613.0, + "reward": 0.515625, + "reward_std": 0.2835350036621094, + "rewards/accuracy_reward/mean": 0.515625, + "rewards/accuracy_reward/std": 0.5017194747924805, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999364614486694, + "sampling/importance_sampling_ratio/min": 2.462414704496041e-05, + "sampling/sampling_logp_difference/max": 10.611783027648926, + "sampling/sampling_logp_difference/mean": 0.018557455390691757, + "step": 446 + }, + { + "clip_ratio/high_max": 1.3460261698128306e-05, + "clip_ratio/high_mean": 4.301844171550329e-06, + "clip_ratio/low_mean": 5.543450777167891e-05, + "clip_ratio/low_min": 1.7309802160525578e-05, + "clip_ratio/region_mean": 5.973635086320428e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 7435.53125, + "completions/mean_terminated_length": 7220.7685546875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 1.0237125977873802, + "epoch": 0.41122355105795766, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004084885586053133, + "learning_rate": 1e-05, + "loss": 0.1085, + "num_tokens": 377476249.0, + "reward": 0.3984375, + "reward_std": 0.29538238048553467, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999234676361084, + "sampling/importance_sampling_ratio/min": 
0.0008572525111958385, + "sampling/sampling_logp_difference/max": 7.0617780685424805, + "sampling/sampling_logp_difference/mean": 0.02096719481050968, + "step": 447 + }, + { + "clip_ratio/high_max": 2.7470227905723732e-05, + "clip_ratio/high_mean": 8.26576740564633e-06, + "clip_ratio/low_mean": 3.730497360265872e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.557074043987086e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16351.0, + "completions/mean_length": 7228.1953125, + "completions/mean_terminated_length": 7008.45654296875, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "entropy": 0.9323876351118088, + "epoch": 0.41214351425942963, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0025359690189361572, + "learning_rate": 1e-05, + "loss": 0.1291, + "num_tokens": 378423338.0, + "reward": 0.5078125, + "reward_std": 0.3182457685470581, + "rewards/accuracy_reward/mean": 0.5078125, + "rewards/accuracy_reward/std": 0.5019033551216125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000035762786865, + "sampling/importance_sampling_ratio/min": 0.00026116587105207145, + "sampling/sampling_logp_difference/max": 8.250354766845703, + "sampling/sampling_logp_difference/mean": 0.020272942259907722, + "step": 448 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 378423338, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-448/zero_to_fp32.py b/dapo_lora_plus_20251202_001141/checkpoint-448/zero_to_fp32.py new 
file mode 100644 index 0000000000000000000000000000000000000000..5995d6e6f04e43b989587aa9022a3aef0c66d694 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-448/zero_to_fp32.py @@ -0,0 +1,760 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: +# python zero_to_fp32.py . output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization + +import argparse +import torch +import glob +import math +import os +import re +import gc +import json +import numpy as np +from tqdm import tqdm +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device, weights_only=False) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + total_files = len(files) + state_dicts = [] + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", 
None) + state_dicts.append(state_dict) + + if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = 
parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in 
range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for 
s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a 
mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared 
parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. 
+ Convert the pseudo tensor to torch tensor by ``.contiguous()`` + + Returns: + - pytorch ``state_dict`` + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. 
If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. + weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # an memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, 
shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info("Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info("Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files" + "(e.g. path/checkpoint-12-output/)") + parser.add_argument( + "--max_shard_size", + type=str, + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size" + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`" + "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances" + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/README.md b/dapo_lora_plus_20251202_001141/checkpoint-64/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-64/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both 
direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/adapter_config.json b/dapo_lora_plus_20251202_001141/checkpoint-64/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..57b1340e85011632bb78b2fd3b13b455f6b0d622 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-64/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + 
"k_proj", + "gate_proj", + "down_proj", + "up_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/chat_template.jinja b/dapo_lora_plus_20251202_001141/checkpoint-64/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..c2066bd7391c270626e39c9d7124f00360126412 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-64/chat_template.jinja @@ -0,0 +1 @@ +{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false 
-%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\n'}}{% endif %} \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/latest b/dapo_lora_plus_20251202_001141/checkpoint-64/latest new file mode 100644 index 0000000000000000000000000000000000000000..4a12e7f9029554e8e5ce68ebe3e97d0b4e734304 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-64/latest @@ -0,0 +1 @@ +global_step64 \ No newline at end of file diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/special_tokens_map.json b/dapo_lora_plus_20251202_001141/checkpoint-64/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-64/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/tokenizer_config.json 
b/dapo_lora_plus_20251202_001141/checkpoint-64/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-64/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": 
"LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_lora_plus_20251202_001141/checkpoint-64/trainer_state.json b/dapo_lora_plus_20251202_001141/checkpoint-64/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c211faa74990af61cf4d03795dd1b2c15f6e5375 --- /dev/null +++ b/dapo_lora_plus_20251202_001141/checkpoint-64/trainer_state.json @@ -0,0 +1,2018 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.05887764489420423, + "eval_steps": 500, + "global_step": 64, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15689.0, + "completions/max_terminated_length": 15689.0, + "completions/mean_length": 6039.171875, + "completions/mean_terminated_length": 6039.171875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 1.19118632376194, + "epoch": 0.0009199632014719411, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0025745572056621313, + "learning_rate": 1e-05, + "loss": 0.0591, + "num_tokens": 792270.0, + "reward": 0.25, + "reward_std": 0.24435341358184814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999940395355225, + "sampling/importance_sampling_ratio/min": 0.0002457273658365011, + "sampling/sampling_logp_difference/max": 8.311287879943848, + "sampling/sampling_logp_difference/mean": 0.021642697975039482, + "step": 1 + }, + { + "clip_ratio/high_max": 5.499582130141789e-06, + "clip_ratio/high_mean": 1.3748955325354473e-06, + "clip_ratio/low_mean": 
2.871888784738985e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.009378326623846e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16292.0, + "completions/max_terminated_length": 16292.0, + "completions/mean_length": 4767.1875, + "completions/mean_terminated_length": 4767.1875, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.088237851858139, + "epoch": 0.0018399264029438822, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002068034838885069, + "learning_rate": 1e-05, + "loss": 0.0258, + "num_tokens": 1425798.0, + "reward": 0.3046875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999016523361206, + "sampling/importance_sampling_ratio/min": 0.01811397261917591, + "sampling/sampling_logp_difference/max": 4.011071681976318, + "sampling/sampling_logp_difference/mean": 0.01877593621611595, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.459846724103045e-05, + "clip_ratio/low_min": 3.4060874440910993e-06, + "clip_ratio/region_mean": 4.459846724103045e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16317.0, + "completions/mean_length": 6586.359375, + "completions/mean_terminated_length": 6351.21630859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0497623533010483, + "epoch": 0.0027598896044158236, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.001971944235265255, + "learning_rate": 1e-05, + "loss": 0.0199, + "num_tokens": 2287420.0, + "reward": 0.28125, + "reward_std": 0.29143062233924866, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999316334724426, + "sampling/importance_sampling_ratio/min": 5.356698966352269e-05, + "sampling/sampling_logp_difference/max": 9.834577560424805, + "sampling/sampling_logp_difference/mean": 0.02137824520468712, + "step": 3 + }, + { + "clip_ratio/high_max": 1.7640652004047297e-05, + "clip_ratio/high_mean": 5.48578327652649e-06, + "clip_ratio/low_mean": 3.218628648937738e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.767206976590387e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14690.0, + "completions/max_terminated_length": 14690.0, + "completions/mean_length": 5448.0234375, + "completions/mean_terminated_length": 5448.0234375, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 1.1134418621659279, + "epoch": 0.0036798528058877645, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016465173102915287, + "learning_rate": 1e-05, + "loss": 0.0433, + "num_tokens": 3009167.0, + "reward": 0.2890625, + "reward_std": 0.27958330512046814, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 7.889385415182915e-06, + "sampling/sampling_logp_difference/max": 11.749992370605469, + "sampling/sampling_logp_difference/mean": 0.020580951124429703, + "step": 4 + }, + { + "clip_ratio/high_max": 1.3439519989333348e-05, + "clip_ratio/high_mean": 3.359879997333337e-06, + "clip_ratio/low_mean": 2.8849915906903334e-05, + "clip_ratio/low_min": 8.467687621305231e-06, + "clip_ratio/region_mean": 3.220979442630778e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13420.0, + "completions/mean_length": 5436.8671875, + 
"completions/mean_terminated_length": 5350.66943359375, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "entropy": 1.1473859176039696, + "epoch": 0.004599816007359705, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0023770295083522797, + "learning_rate": 1e-05, + "loss": 0.0153, + "num_tokens": 3725654.0, + "reward": 0.2734375, + "reward_std": 0.27434611320495605, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99991774559021, + "sampling/importance_sampling_ratio/min": 0.0011146117467433214, + "sampling/sampling_logp_difference/max": 6.799249172210693, + "sampling/sampling_logp_difference/mean": 0.020377254113554955, + "step": 5 + }, + { + "clip_ratio/high_max": 4.652201369026443e-06, + "clip_ratio/high_mean": 1.1630503422566107e-06, + "clip_ratio/low_mean": 2.8399212624208303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9562263534899103e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14440.0, + "completions/max_terminated_length": 14440.0, + "completions/mean_length": 4697.5390625, + "completions/mean_terminated_length": 4697.5390625, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 1.0097229778766632, + "epoch": 0.005519779208831647, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003342699259519577, + "learning_rate": 1e-05, + "loss": 0.0326, + "num_tokens": 4345547.0, + "reward": 0.390625, + "reward_std": 0.34480881690979004, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914765357971, + "sampling/importance_sampling_ratio/min": 0.002385853324085474, + "sampling/sampling_logp_difference/max": 6.038198471069336, + 
"sampling/sampling_logp_difference/mean": 0.0185473021119833, + "step": 6 + }, + { + "clip_ratio/high_max": 9.362594937556423e-06, + "clip_ratio/high_mean": 2.340648734389106e-06, + "clip_ratio/low_mean": 6.054362825125281e-05, + "clip_ratio/low_min": 7.427356649714056e-06, + "clip_ratio/region_mean": 6.288427744038927e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14652.0, + "completions/mean_length": 6218.2109375, + "completions/mean_terminated_length": 5890.2822265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 1.0579778030514717, + "epoch": 0.006439742410303588, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002073560608550906, + "learning_rate": 1e-05, + "loss": 0.0201, + "num_tokens": 5160646.0, + "reward": 0.2109375, + "reward_std": 0.27222445607185364, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 0.00044544730917550623, + "sampling/sampling_logp_difference/max": 7.716431617736816, + "sampling/sampling_logp_difference/mean": 0.020321575924754143, + "step": 7 + }, + { + "clip_ratio/high_max": 1.1064067621191498e-05, + "clip_ratio/high_mean": 2.7660169052978745e-06, + "clip_ratio/low_mean": 2.2175867059104348e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.4941883737028547e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13637.0, + "completions/mean_length": 5127.8359375, + "completions/mean_terminated_length": 5039.20458984375, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 1.0472618415951729, + "epoch": 0.007359705611775529, + "frac_reward_zero_std": 0.375, + "grad_norm": 
0.0032994600478559732, + "learning_rate": 1e-05, + "loss": 0.0751, + "num_tokens": 5836289.0, + "reward": 0.3359375, + "reward_std": 0.2948455810546875, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483227729797, + "sampling/importance_sampling_ratio/min": 0.0013780994340777397, + "sampling/sampling_logp_difference/max": 6.587049961090088, + "sampling/sampling_logp_difference/mean": 0.01940803974866867, + "step": 8 + }, + { + "clip_ratio/high_max": 1.2357884770608507e-05, + "clip_ratio/high_mean": 3.0894711926521268e-06, + "clip_ratio/low_mean": 3.000627111759968e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.309574231025181e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15916.0, + "completions/mean_length": 4516.890625, + "completions/mean_terminated_length": 4423.44873046875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.911251038312912, + "epoch": 0.00827966881324747, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003016560571268201, + "learning_rate": 1e-05, + "loss": 0.1006, + "num_tokens": 6433171.0, + "reward": 0.390625, + "reward_std": 0.3066929578781128, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999179840087891, + "sampling/importance_sampling_ratio/min": 0.005480794236063957, + "sampling/sampling_logp_difference/max": 5.206505298614502, + "sampling/sampling_logp_difference/mean": 0.017437148839235306, + "step": 9 + }, + { + "clip_ratio/high_max": 4.6329013457580004e-05, + "clip_ratio/high_mean": 1.1582253364395001e-05, + "clip_ratio/low_mean": 7.069455705277505e-05, + "clip_ratio/low_min": 0.0, + 
"clip_ratio/region_mean": 8.227681109929108e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13970.0, + "completions/mean_length": 4961.453125, + "completions/mean_terminated_length": 4687.31201171875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "entropy": 0.6808596402406693, + "epoch": 0.00919963201471941, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0035386616364121437, + "learning_rate": 1e-05, + "loss": 0.0596, + "num_tokens": 7085389.0, + "reward": 0.5625, + "reward_std": 0.3816363215446472, + "rewards/accuracy_reward/mean": 0.5625, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.0002734088629949838, + "sampling/sampling_logp_difference/max": 8.20454216003418, + "sampling/sampling_logp_difference/mean": 0.01566406339406967, + "step": 10 + }, + { + "clip_ratio/high_max": 2.43190661421977e-05, + "clip_ratio/high_mean": 6.079766535549425e-06, + "clip_ratio/low_mean": 2.2395396172214532e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8475162707763957e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14776.0, + "completions/mean_length": 4429.40625, + "completions/mean_terminated_length": 4335.275390625, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "entropy": 0.9181502386927605, + "epoch": 0.010119595216191352, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0022535293828696012, + "learning_rate": 1e-05, + "loss": 0.0031, + "num_tokens": 7672185.0, + "reward": 0.3671875, + "reward_std": 0.20357418060302734, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + 
"sampling/importance_sampling_ratio/mean": 0.9998801946640015, + "sampling/importance_sampling_ratio/min": 5.315856554943821e-08, + "sampling/sampling_logp_difference/max": 16.74998664855957, + "sampling/sampling_logp_difference/mean": 0.018429335206747055, + "step": 11 + }, + { + "clip_ratio/high_max": 1.0117325928149512e-05, + "clip_ratio/high_mean": 2.529331482037378e-06, + "clip_ratio/low_mean": 1.1982813475697185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.45121450714214e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 5282.6796875, + "completions/mean_terminated_length": 5106.46875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "entropy": 1.113751620054245, + "epoch": 0.011039558417663294, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0013591813622042537, + "learning_rate": 1e-05, + "loss": 0.0971, + "num_tokens": 8369000.0, + "reward": 0.3984375, + "reward_std": 0.3029736578464508, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 3.970265970565379e-05, + "sampling/sampling_logp_difference/max": 10.134092330932617, + "sampling/sampling_logp_difference/mean": 0.020221836864948273, + "step": 12 + }, + { + "clip_ratio/high_max": 5.411958227341529e-06, + "clip_ratio/high_mean": 1.3529895568353822e-06, + "clip_ratio/low_mean": 2.5284593846208736e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6637583516730956e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15925.0, + "completions/mean_length": 6970.421875, + "completions/mean_terminated_length": 6744.49609375, + "completions/min_length": 53.0, + 
"completions/min_terminated_length": 53.0, + "entropy": 1.1721933633089066, + "epoch": 0.011959521619135235, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0024079051800072193, + "learning_rate": 1e-05, + "loss": 0.0713, + "num_tokens": 9283182.0, + "reward": 0.171875, + "reward_std": 0.17965975403785706, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999163746833801, + "sampling/importance_sampling_ratio/min": 0.0008915197686292231, + "sampling/sampling_logp_difference/max": 7.0225830078125, + "sampling/sampling_logp_difference/mean": 0.021462474018335342, + "step": 13 + }, + { + "clip_ratio/high_max": 2.0661535927501973e-05, + "clip_ratio/high_mean": 5.165383981875493e-06, + "clip_ratio/low_mean": 2.4304956298237812e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.947033948430544e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14658.0, + "completions/max_terminated_length": 14658.0, + "completions/mean_length": 4886.875, + "completions/mean_terminated_length": 4886.875, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 1.0108910650014877, + "epoch": 0.012879484820607176, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.002063734456896782, + "learning_rate": 1e-05, + "loss": 0.0386, + "num_tokens": 9928446.0, + "reward": 0.3515625, + "reward_std": 0.2409384697675705, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 0.0003672837920021266, + "sampling/sampling_logp_difference/max": 7.9093756675720215, + "sampling/sampling_logp_difference/mean": 0.01918785460293293, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + 
"clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.4761846993424115e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.4761846993424115e-06, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12992.0, + "completions/max_terminated_length": 12992.0, + "completions/mean_length": 4824.0078125, + "completions/mean_terminated_length": 4824.0078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 1.1070282831788063, + "epoch": 0.013799448022079117, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002424790756776929, + "learning_rate": 1e-05, + "loss": 0.0485, + "num_tokens": 10566415.0, + "reward": 0.28125, + "reward_std": 0.23698672652244568, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000125169754028, + "sampling/importance_sampling_ratio/min": 0.0011708867968991399, + "sampling/sampling_logp_difference/max": 6.749993801116943, + "sampling/sampling_logp_difference/mean": 0.02069389820098877, + "step": 15 + }, + { + "clip_ratio/high_max": 3.5075904634140898e-06, + "clip_ratio/high_mean": 8.768976158535224e-07, + "clip_ratio/low_mean": 2.2676964135825983e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.3553861751679506e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12685.0, + "completions/mean_length": 5449.4140625, + "completions/mean_terminated_length": 5363.31494140625, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "entropy": 0.9817888736724854, + "epoch": 0.014719411223551058, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021046048495918512, + "learning_rate": 1e-05, + "loss": 0.0252, + "num_tokens": 11281908.0, + "reward": 0.2265625, + "reward_std": 0.27168765664100647, + "rewards/accuracy_reward/mean": 0.2265625, + 
"rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.013273254036903381, + "sampling/sampling_logp_difference/max": 4.322004318237305, + "sampling/sampling_logp_difference/mean": 0.019556276500225067, + "step": 16 + }, + { + "clip_ratio/high_max": 1.624216065465589e-05, + "clip_ratio/high_mean": 4.060540163663973e-06, + "clip_ratio/low_mean": 5.4349347919924185e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.840988796990132e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14133.0, + "completions/max_terminated_length": 14133.0, + "completions/mean_length": 5343.25, + "completions/mean_terminated_length": 5343.25, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 1.04741720110178, + "epoch": 0.015639374425023, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0035894038155674934, + "learning_rate": 1e-05, + "loss": 0.0584, + "num_tokens": 11987692.0, + "reward": 0.3359375, + "reward_std": 0.3124620020389557, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998996257781982, + "sampling/importance_sampling_ratio/min": 2.1446165192173794e-05, + "sampling/sampling_logp_difference/max": 10.749964714050293, + "sampling/sampling_logp_difference/mean": 0.020530637353658676, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 4.272115029380075e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.272115029380075e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15138.0, + "completions/mean_length": 6301.9375, + "completions/mean_terminated_length": 
5806.09814453125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.8892941772937775, + "epoch": 0.01655933762649494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0032246762420982122, + "learning_rate": 1e-05, + "loss": 0.0811, + "num_tokens": 12814244.0, + "reward": 0.3125, + "reward_std": 0.3606000542640686, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999184608459473, + "sampling/importance_sampling_ratio/min": 0.021351110190153122, + "sampling/sampling_logp_difference/max": 3.846651554107666, + "sampling/sampling_logp_difference/mean": 0.017541853711009026, + "step": 18 + }, + { + "clip_ratio/high_max": 9.956602298188955e-06, + "clip_ratio/high_mean": 2.4891505745472386e-06, + "clip_ratio/low_mean": 2.772165316855535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.0210803743102588e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16213.0, + "completions/max_terminated_length": 16213.0, + "completions/mean_length": 5297.46875, + "completions/mean_terminated_length": 5297.46875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.8097029253840446, + "epoch": 0.017479300827966882, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023969109170138836, + "learning_rate": 1e-05, + "loss": -0.0153, + "num_tokens": 13512520.0, + "reward": 0.359375, + "reward_std": 0.248829185962677, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999222159385681, + "sampling/importance_sampling_ratio/min": 0.005766105372458696, + "sampling/sampling_logp_difference/max": 5.155758380889893, + "sampling/sampling_logp_difference/mean": 0.017464376986026764, + "step": 19 + }, + { + 
"clip_ratio/high_max": 1.0098337497765897e-05, + "clip_ratio/high_mean": 2.524584374441474e-06, + "clip_ratio/low_mean": 3.173396362399217e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.425854845318099e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14655.0, + "completions/mean_length": 4890.34375, + "completions/mean_terminated_length": 4799.84228515625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.9267145916819572, + "epoch": 0.01839926402943882, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002759338356554508, + "learning_rate": 1e-05, + "loss": -0.0014, + "num_tokens": 14155556.0, + "reward": 0.3515625, + "reward_std": 0.31010788679122925, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999570250511169, + "sampling/importance_sampling_ratio/min": 0.008491010405123234, + "sampling/sampling_logp_difference/max": 4.768747329711914, + "sampling/sampling_logp_difference/mean": 0.018839433789253235, + "step": 20 + }, + { + "clip_ratio/high_max": 7.532389190600952e-06, + "clip_ratio/high_mean": 1.883097297650238e-06, + "clip_ratio/low_mean": 1.9051809317716106e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0934906729053182e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 16296.0, + "completions/max_terminated_length": 16296.0, + "completions/mean_length": 4609.40625, + "completions/mean_terminated_length": 4609.40625, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 1.171089917421341, + "epoch": 0.019319227230910764, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0021055075339972973, + "learning_rate": 1e-05, + "loss": -0.0051, + "num_tokens": 14765328.0, + "reward": 0.2421875, + "reward_std": 
0.2409384548664093, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999741911888123, + "sampling/importance_sampling_ratio/min": 5.368983693188056e-07, + "sampling/sampling_logp_difference/max": 14.437457084655762, + "sampling/sampling_logp_difference/mean": 0.020226795226335526, + "step": 21 + }, + { + "clip_ratio/high_max": 1.7169573766295798e-05, + "clip_ratio/high_mean": 4.2923934415739495e-06, + "clip_ratio/low_mean": 5.869748633813288e-06, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.0162142189074075e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14299.0, + "completions/mean_length": 5099.0390625, + "completions/mean_terminated_length": 5010.18115234375, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "entropy": 1.005959376692772, + "epoch": 0.020239190432382703, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0027595218271017075, + "learning_rate": 1e-05, + "loss": 0.0236, + "num_tokens": 15438549.0, + "reward": 0.296875, + "reward_std": 0.20069602131843567, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999887347221375, + "sampling/importance_sampling_ratio/min": 0.00013984869292471558, + "sampling/sampling_logp_difference/max": 8.87494945526123, + "sampling/sampling_logp_difference/mean": 0.01902824640274048, + "step": 22 + }, + { + "clip_ratio/high_max": 5.162942670722259e-06, + "clip_ratio/high_mean": 1.2907356676805648e-06, + "clip_ratio/low_mean": 3.6872071063953626e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.816280593582633e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + 
"completions/max_terminated_length": 16204.0, + "completions/mean_length": 7138.0390625, + "completions/mean_terminated_length": 6839.7822265625, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 1.0403362140059471, + "epoch": 0.021159153633854646, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002748022088780999, + "learning_rate": 1e-05, + "loss": 0.0647, + "num_tokens": 16373898.0, + "reward": 0.296875, + "reward_std": 0.3169426918029785, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999048709869385, + "sampling/importance_sampling_ratio/min": 0.0003802926803473383, + "sampling/sampling_logp_difference/max": 7.874569416046143, + "sampling/sampling_logp_difference/mean": 0.020853528752923012, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 5.6506045439164154e-05, + "clip_ratio/low_min": 5.709326615033206e-06, + "clip_ratio/region_mean": 5.6506045439164154e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14543.0, + "completions/mean_length": 5420.515625, + "completions/mean_terminated_length": 5334.18896484375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "entropy": 1.1339883506298065, + "epoch": 0.02207911683532659, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029502976685762405, + "learning_rate": 1e-05, + "loss": 0.0756, + "num_tokens": 17088156.0, + "reward": 0.1953125, + "reward_std": 0.25620076060295105, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999445676803589, + "sampling/importance_sampling_ratio/min": 9.70982582657598e-05, + 
"sampling/sampling_logp_difference/max": 9.239787101745605, + "sampling/sampling_logp_difference/mean": 0.0199423898011446, + "step": 24 + }, + { + "clip_ratio/high_max": 5.619998319161823e-06, + "clip_ratio/high_mean": 1.4049995797904558e-06, + "clip_ratio/low_mean": 6.439320418394345e-05, + "clip_ratio/low_min": 4.70632539872895e-06, + "clip_ratio/region_mean": 6.57982034226734e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14636.0, + "completions/mean_length": 5116.3046875, + "completions/mean_terminated_length": 4845.88037109375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.9503882825374603, + "epoch": 0.022999080036798528, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004891107324510813, + "learning_rate": 1e-05, + "loss": 0.0522, + "num_tokens": 17766619.0, + "reward": 0.3203125, + "reward_std": 0.3366856575012207, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999970018863678, + "sampling/importance_sampling_ratio/min": 0.0010618992382660508, + "sampling/sampling_logp_difference/max": 6.847696304321289, + "sampling/sampling_logp_difference/mean": 0.01914183795452118, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.839018643247982e-05, + "clip_ratio/low_min": 4.115091087442124e-06, + "clip_ratio/region_mean": 3.839018643247982e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14634.0, + "completions/mean_length": 5061.8671875, + "completions/mean_terminated_length": 4972.71630859375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 1.0540335327386856, + "epoch": 0.02391904323827047, + "frac_reward_zero_std": 
0.375, + "grad_norm": 0.0030373274348676205, + "learning_rate": 1e-05, + "loss": 0.0246, + "num_tokens": 18432938.0, + "reward": 0.34375, + "reward_std": 0.28118088841438293, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999624490737915, + "sampling/importance_sampling_ratio/min": 1.7212972807101323e-06, + "sampling/sampling_logp_difference/max": 13.272432327270508, + "sampling/sampling_logp_difference/mean": 0.019548218697309494, + "step": 26 + }, + { + "clip_ratio/high_max": 1.4656657867817557e-05, + "clip_ratio/high_mean": 4.665093399580655e-06, + "clip_ratio/low_mean": 3.751162262233265e-05, + "clip_ratio/low_min": 4.413062470121076e-06, + "clip_ratio/region_mean": 4.2176716192443564e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15782.0, + "completions/max_terminated_length": 15782.0, + "completions/mean_length": 6349.9765625, + "completions/mean_terminated_length": 6349.9765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.0268081277608871, + "epoch": 0.02483900643974241, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0017623496241867542, + "learning_rate": 1e-05, + "loss": 0.0011, + "num_tokens": 19264743.0, + "reward": 0.2734375, + "reward_std": 0.33903974294662476, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000061988830566, + "sampling/importance_sampling_ratio/min": 6.870362267363816e-05, + "sampling/sampling_logp_difference/max": 9.585708618164062, + "sampling/sampling_logp_difference/mean": 0.019106190651655197, + "step": 27 + }, + { + "clip_ratio/high_max": 9.221375876222737e-06, + "clip_ratio/high_mean": 2.3053439690556843e-06, + "clip_ratio/low_mean": 3.09787185415189e-05, + 
"clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.328406273794826e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15944.0, + "completions/mean_length": 5815.484375, + "completions/mean_terminated_length": 5561.84033203125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "entropy": 1.0389493256807327, + "epoch": 0.025758969641214352, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.003111837198957801, + "learning_rate": 1e-05, + "loss": -0.0162, + "num_tokens": 20030109.0, + "reward": 0.34375, + "reward_std": 0.32719242572784424, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000298023223877, + "sampling/importance_sampling_ratio/min": 0.02987043187022209, + "sampling/sampling_logp_difference/max": 3.5108861923217773, + "sampling/sampling_logp_difference/mean": 0.020060991868376732, + "step": 28 + }, + { + "clip_ratio/high_max": 6.7810142354574054e-06, + "clip_ratio/high_mean": 1.6952535588643514e-06, + "clip_ratio/low_mean": 4.474762545214617e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.644287901101052e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15371.0, + "completions/mean_length": 5157.1484375, + "completions/mean_terminated_length": 5068.748046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 1.0510126948356628, + "epoch": 0.02667893284268629, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003041633637621999, + "learning_rate": 1e-05, + "loss": 0.0471, + "num_tokens": 20710904.0, + "reward": 0.3125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.4653336703777313, + 
"sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999587535858154, + "sampling/importance_sampling_ratio/min": 0.04357198625802994, + "sampling/sampling_logp_difference/max": 3.133340835571289, + "sampling/sampling_logp_difference/mean": 0.019007597118616104, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.0962848566341563e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.0962848566341563e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15333.0, + "completions/max_terminated_length": 15333.0, + "completions/mean_length": 4446.3828125, + "completions/mean_terminated_length": 4446.3828125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 1.053279548883438, + "epoch": 0.027598896044158234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022369560319930315, + "learning_rate": 1e-05, + "loss": -0.001, + "num_tokens": 21298497.0, + "reward": 0.390625, + "reward_std": 0.24169495701789856, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4898075461387634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998750686645508, + "sampling/importance_sampling_ratio/min": 0.006704842206090689, + "sampling/sampling_logp_difference/max": 5.00492525100708, + "sampling/sampling_logp_difference/mean": 0.01947362720966339, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.8460265411922592e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8460265411922592e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15386.0, + "completions/mean_length": 6294.1484375, + "completions/mean_terminated_length": 6133.9921875, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, 
+ "entropy": 1.2036212533712387, + "epoch": 0.028518859245630176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0021383841522037983, + "learning_rate": 1e-05, + "loss": 0.033, + "num_tokens": 22124812.0, + "reward": 0.171875, + "reward_std": 0.20752590894699097, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999858736991882, + "sampling/importance_sampling_ratio/min": 3.9575263599544996e-07, + "sampling/sampling_logp_difference/max": 14.742476463317871, + "sampling/sampling_logp_difference/mean": 0.022367021068930626, + "step": 31 + }, + { + "clip_ratio/high_max": 1.73864664247958e-05, + "clip_ratio/high_mean": 4.34661660619895e-06, + "clip_ratio/low_mean": 3.19569651310303e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.630358173722925e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14893.0, + "completions/mean_length": 6011.4921875, + "completions/mean_terminated_length": 5929.81884765625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.123318687081337, + "epoch": 0.029438822447102116, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.00126531848218292, + "learning_rate": 1e-05, + "loss": 0.0119, + "num_tokens": 22915091.0, + "reward": 0.171875, + "reward_std": 0.2330477386713028, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999861121177673, + "sampling/importance_sampling_ratio/min": 1.6368276192224585e-05, + "sampling/sampling_logp_difference/max": 11.02016544342041, + "sampling/sampling_logp_difference/mean": 0.019905246794223785, + "step": 32 + }, + { + "clip_ratio/high_max": 2.8753217975463485e-05, + "clip_ratio/high_mean": 
7.188304493865871e-06, + "clip_ratio/low_mean": 3.818478444372886e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.537308905128157e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16332.0, + "completions/mean_length": 5152.46875, + "completions/mean_terminated_length": 5064.03125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 1.0477670058608055, + "epoch": 0.03035878564857406, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0030069497879594564, + "learning_rate": 1e-05, + "loss": 0.1026, + "num_tokens": 23596487.0, + "reward": 0.3359375, + "reward_std": 0.29142576456069946, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 9.009604013954231e-07, + "sampling/sampling_logp_difference/max": 13.919804573059082, + "sampling/sampling_logp_difference/mean": 0.019003981724381447, + "step": 33 + }, + { + "clip_ratio/high_max": 3.069575450354023e-05, + "clip_ratio/high_mean": 7.673938625885057e-06, + "clip_ratio/low_mean": 3.4847614415411954e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.252155258654966e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 12792.0, + "completions/max_terminated_length": 12792.0, + "completions/mean_length": 4672.5703125, + "completions/mean_terminated_length": 4672.5703125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.9471446052193642, + "epoch": 0.031278748850046, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002676331205293536, + "learning_rate": 1e-05, + "loss": 0.0724, + "num_tokens": 24213408.0, + "reward": 0.3203125, + "reward_std": 0.2988021969795227, + "rewards/accuracy_reward/mean": 0.3203125, + 
"rewards/accuracy_reward/std": 0.4684300124645233, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000251531600952, + "sampling/importance_sampling_ratio/min": 0.0013351094676181674, + "sampling/sampling_logp_difference/max": 6.618741989135742, + "sampling/sampling_logp_difference/mean": 0.0179576613008976, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.6127243245355203e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6127243245355203e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16108.0, + "completions/mean_length": 7013.734375, + "completions/mean_terminated_length": 6711.4677734375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "entropy": 1.1254516392946243, + "epoch": 0.03219871205151794, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0023615453392267227, + "learning_rate": 1e-05, + "loss": 0.0384, + "num_tokens": 25130262.0, + "reward": 0.1953125, + "reward_std": 0.26485776901245117, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999954342842102, + "sampling/importance_sampling_ratio/min": 6.6197676460433286e-06, + "sampling/sampling_logp_difference/max": 11.925450325012207, + "sampling/sampling_logp_difference/mean": 0.0215257927775383, + "step": 35 + }, + { + "clip_ratio/high_max": 4.06954040954588e-06, + "clip_ratio/high_mean": 1.01738510238647e-06, + "clip_ratio/low_mean": 4.180071573500754e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.281810015527299e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15673.0, + "completions/mean_length": 5858.59375, + 
"completions/mean_terminated_length": 5605.984375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 1.0713739022612572, + "epoch": 0.03311867525298988, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0029018481727689505, + "learning_rate": 1e-05, + "loss": 0.1041, + "num_tokens": 25898194.0, + "reward": 0.3671875, + "reward_std": 0.29036980867385864, + "rewards/accuracy_reward/mean": 0.3671875, + "rewards/accuracy_reward/std": 0.4839322865009308, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999915957450867, + "sampling/importance_sampling_ratio/min": 1.6834765119710937e-05, + "sampling/sampling_logp_difference/max": 10.992064476013184, + "sampling/sampling_logp_difference/mean": 0.019959844648838043, + "step": 36 + }, + { + "clip_ratio/high_max": 1.2810827229259303e-05, + "clip_ratio/high_mean": 3.2027068073148257e-06, + "clip_ratio/low_mean": 3.29701083501277e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.617281504375569e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14004.0, + "completions/mean_length": 6952.6015625, + "completions/mean_terminated_length": 6726.24853515625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.028619796037674, + "epoch": 0.03403863845446182, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0022342968732118607, + "learning_rate": 1e-05, + "loss": 0.0637, + "num_tokens": 26812791.0, + "reward": 0.234375, + "reward_std": 0.26827272772789, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999532699584961, + "sampling/importance_sampling_ratio/min": 4.540153167909011e-05, + "sampling/sampling_logp_difference/max": 9.999964714050293, + 
"sampling/sampling_logp_difference/mean": 0.02002539485692978, + "step": 37 + }, + { + "clip_ratio/high_max": 1.5225089100567857e-05, + "clip_ratio/high_mean": 6.960676159906143e-06, + "clip_ratio/low_mean": 4.09088329433871e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.7869508762232726e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16361.0, + "completions/mean_length": 6413.421875, + "completions/mean_terminated_length": 6174.12841796875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 0.9452399462461472, + "epoch": 0.034958601655933765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0021800603717565536, + "learning_rate": 1e-05, + "loss": 0.0275, + "num_tokens": 27652757.0, + "reward": 0.296875, + "reward_std": 0.31246688961982727, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999439120292664, + "sampling/importance_sampling_ratio/min": 3.895394547726028e-05, + "sampling/sampling_logp_difference/max": 10.153130531311035, + "sampling/sampling_logp_difference/mean": 0.019722118973731995, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.9564903318023426e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.9564903318023426e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15754.0, + "completions/max_terminated_length": 15754.0, + "completions/mean_length": 5176.3515625, + "completions/mean_terminated_length": 5176.3515625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 1.0444758981466293, + "epoch": 0.035878564857405704, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.004153470974415541, + "learning_rate": 1e-05, + "loss": 0.0798, + 
"num_tokens": 28334386.0, + "reward": 0.2734375, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.007421077694743872, + "sampling/sampling_logp_difference/max": 4.903430938720703, + "sampling/sampling_logp_difference/mean": 0.020159056410193443, + "step": 39 + }, + { + "clip_ratio/high_max": 1.725743459246587e-05, + "clip_ratio/high_mean": 4.3143586481164675e-06, + "clip_ratio/low_mean": 2.0204584302518924e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.451894306432223e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15554.0, + "completions/mean_length": 5178.9921875, + "completions/mean_terminated_length": 5001.13525390625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.0803537145256996, + "epoch": 0.03679852805887764, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002477057045325637, + "learning_rate": 1e-05, + "loss": 0.0067, + "num_tokens": 29017145.0, + "reward": 0.2890625, + "reward_std": 0.29932135343551636, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000497102737427, + "sampling/importance_sampling_ratio/min": 0.004630985204130411, + "sampling/sampling_logp_difference/max": 5.374985694885254, + "sampling/sampling_logp_difference/mean": 0.019826076924800873, + "step": 40 + }, + { + "clip_ratio/high_max": 1.6637992303003557e-05, + "clip_ratio/high_mean": 4.159498075750889e-06, + "clip_ratio/low_mean": 2.1970684144889674e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.6130182106953725e-05, + "completions/clipped_ratio": 
0.0, + "completions/max_length": 14131.0, + "completions/max_terminated_length": 14131.0, + "completions/mean_length": 4980.359375, + "completions/mean_terminated_length": 4980.359375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.9510642662644386, + "epoch": 0.03771849126034959, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0016275218222290277, + "learning_rate": 1e-05, + "loss": -0.0097, + "num_tokens": 29673535.0, + "reward": 0.4375, + "reward_std": 0.26249876618385315, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999750852584839, + "sampling/importance_sampling_ratio/min": 0.000599516904912889, + "sampling/sampling_logp_difference/max": 7.419386386871338, + "sampling/sampling_logp_difference/mean": 0.01844976656138897, + "step": 41 + }, + { + "clip_ratio/high_max": 2.8087193186365766e-05, + "clip_ratio/high_mean": 7.021798296591442e-06, + "clip_ratio/low_mean": 3.9683913541921356e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.670571286169434e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15328.0, + "completions/mean_length": 5778.6953125, + "completions/mean_terminated_length": 5695.18896484375, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "entropy": 1.0413239300251007, + "epoch": 0.03863845446182153, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001847646082751453, + "learning_rate": 1e-05, + "loss": -0.0045, + "num_tokens": 30436416.0, + "reward": 0.2578125, + "reward_std": 0.33903977274894714, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998501539230347, + 
"sampling/importance_sampling_ratio/min": 0.00020348970429040492, + "sampling/sampling_logp_difference/max": 8.499895095825195, + "sampling/sampling_logp_difference/mean": 0.021502099931240082, + "step": 42 + }, + { + "clip_ratio/high_max": 2.68402091023745e-05, + "clip_ratio/high_mean": 8.575278570788214e-06, + "clip_ratio/low_mean": 4.547183698377921e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 5.404711600931478e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14182.0, + "completions/max_terminated_length": 14182.0, + "completions/mean_length": 4875.125, + "completions/mean_terminated_length": 4875.125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 1.0464690178632736, + "epoch": 0.03955841766329347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021134833805263042, + "learning_rate": 1e-05, + "loss": 0.0727, + "num_tokens": 31083672.0, + "reward": 0.40625, + "reward_std": 0.3584783971309662, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999340176582336, + "sampling/importance_sampling_ratio/min": 0.012113225646317005, + "sampling/sampling_logp_difference/max": 4.41345739364624, + "sampling/sampling_logp_difference/mean": 0.019140049815177917, + "step": 43 + }, + { + "clip_ratio/high_max": 3.9877967992651975e-05, + "clip_ratio/high_mean": 9.969491998162994e-06, + "clip_ratio/low_mean": 3.981287841270387e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.9782369273998484e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15959.0, + "completions/mean_length": 4691.421875, + "completions/mean_terminated_length": 4505.82568359375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 1.0229775309562683, + "epoch": 
0.040478380864765406, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037735572550445795, + "learning_rate": 1e-05, + "loss": 0.0603, + "num_tokens": 31703654.0, + "reward": 0.4453125, + "reward_std": 0.2993389964103699, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999492168426514, + "sampling/importance_sampling_ratio/min": 0.03150063753128052, + "sampling/sampling_logp_difference/max": 3.457747459411621, + "sampling/sampling_logp_difference/mean": 0.01912039890885353, + "step": 44 + }, + { + "clip_ratio/high_max": 3.5441889849607833e-06, + "clip_ratio/high_mean": 8.860472462401958e-07, + "clip_ratio/low_mean": 1.5137359810069029e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 1.6023407056309225e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15605.0, + "completions/mean_length": 6821.96875, + "completions/mean_terminated_length": 6592.48046875, + "completions/min_length": 1196.0, + "completions/min_terminated_length": 1196.0, + "entropy": 1.1132484003901482, + "epoch": 0.04139834406623735, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.0010448681423440576, + "learning_rate": 1e-05, + "loss": 0.022, + "num_tokens": 32599778.0, + "reward": 0.2265625, + "reward_std": 0.1814819872379303, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999915361404419, + "sampling/importance_sampling_ratio/min": 0.006500681862235069, + "sampling/sampling_logp_difference/max": 5.035848140716553, + "sampling/sampling_logp_difference/mean": 0.02125459350645542, + "step": 45 + }, + { + "clip_ratio/high_max": 4.652893949241843e-06, + "clip_ratio/high_mean": 1.1632234873104608e-06, + "clip_ratio/low_mean": 
5.731516603191267e-05, + "clip_ratio/low_min": 9.891066838463303e-06, + "clip_ratio/region_mean": 5.8478389746596804e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15753.0, + "completions/mean_length": 6834.3671875, + "completions/mean_terminated_length": 6605.17626953125, + "completions/min_length": 624.0, + "completions/min_terminated_length": 624.0, + "entropy": 0.9827468693256378, + "epoch": 0.04231830726770929, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0017670176457613707, + "learning_rate": 1e-05, + "loss": 0.1105, + "num_tokens": 33492737.0, + "reward": 0.3046875, + "reward_std": 0.3440523147583008, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999089241027832, + "sampling/importance_sampling_ratio/min": 0.0021202093921601772, + "sampling/sampling_logp_difference/max": 6.156240463256836, + "sampling/sampling_logp_difference/mean": 0.019490526989102364, + "step": 46 + }, + { + "clip_ratio/high_max": 6.717360520269722e-06, + "clip_ratio/high_mean": 2.503530367903295e-06, + "clip_ratio/low_mean": 2.5672919832686603e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.8176450200589898e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14098.0, + "completions/mean_length": 6175.296875, + "completions/mean_terminated_length": 5845.98388671875, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 1.1584237962961197, + "epoch": 0.04323827046918123, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.0016891945851966739, + "learning_rate": 1e-05, + "loss": -0.0008, + "num_tokens": 34312455.0, + "reward": 0.1875, + "reward_std": 0.19673937559127808, + "rewards/accuracy_reward/mean": 0.1875, + 
"rewards/accuracy_reward/std": 0.39184603095054626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 8.086384332273155e-05, + "sampling/sampling_logp_difference/max": 9.422743797302246, + "sampling/sampling_logp_difference/mean": 0.021749887615442276, + "step": 47 + }, + { + "clip_ratio/high_max": 2.2362002255249536e-05, + "clip_ratio/high_mean": 8.189798336388776e-06, + "clip_ratio/low_mean": 2.1058204993096297e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.9248002192616696e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16054.0, + "completions/mean_length": 6036.8359375, + "completions/mean_terminated_length": 5955.3623046875, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.9301538467407227, + "epoch": 0.04415823367065318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.003834392176941037, + "learning_rate": 1e-05, + "loss": 0.0636, + "num_tokens": 35102738.0, + "reward": 0.4375, + "reward_std": 0.36614155769348145, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998494386672974, + "sampling/importance_sampling_ratio/min": 0.00013992394087836146, + "sampling/sampling_logp_difference/max": 8.874411582946777, + "sampling/sampling_logp_difference/mean": 0.019147861748933792, + "step": 48 + }, + { + "clip_ratio/high_max": 1.1501961580506759e-05, + "clip_ratio/high_mean": 2.8754903951266897e-06, + "clip_ratio/low_mean": 4.08189714562468e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.369446196506033e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15594.0, + "completions/mean_length": 
6262.46875, + "completions/mean_terminated_length": 5764.68798828125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.8599015846848488, + "epoch": 0.045078196872125116, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.0029804729856550694, + "learning_rate": 1e-05, + "loss": 0.0495, + "num_tokens": 35924886.0, + "reward": 0.3984375, + "reward_std": 0.3911295533180237, + "rewards/accuracy_reward/mean": 0.3984375, + "rewards/accuracy_reward/std": 0.4915000796318054, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999922513961792, + "sampling/importance_sampling_ratio/min": 0.00021375219512265176, + "sampling/sampling_logp_difference/max": 9.904524803161621, + "sampling/sampling_logp_difference/mean": 0.01815103553235531, + "step": 49 + }, + { + "clip_ratio/high_max": 2.4107544049911667e-05, + "clip_ratio/high_mean": 6.026886012477917e-06, + "clip_ratio/low_mean": 3.6588148361715866e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.261503391944643e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14556.0, + "completions/max_terminated_length": 14556.0, + "completions/mean_length": 5926.8984375, + "completions/mean_terminated_length": 5926.8984375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 1.0042993426322937, + "epoch": 0.045998160073597055, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0022071697749197483, + "learning_rate": 1e-05, + "loss": 0.0059, + "num_tokens": 36700913.0, + "reward": 0.3359375, + "reward_std": 0.3306073546409607, + "rewards/accuracy_reward/mean": 0.3359375, + "rewards/accuracy_reward/std": 0.47417303919792175, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000010371208191, + "sampling/importance_sampling_ratio/min": 0.0005220364546403289, + "sampling/sampling_logp_difference/max": 7.557773113250732, + 
"sampling/sampling_logp_difference/mean": 0.01954064890742302, + "step": 50 + }, + { + "clip_ratio/high_max": 4.9106265578302555e-06, + "clip_ratio/high_mean": 1.2276566394575639e-06, + "clip_ratio/low_mean": 2.634599570683349e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7573652346291055e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15217.0, + "completions/mean_length": 6873.6875, + "completions/mean_terminated_length": 6645.4404296875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 1.0255412608385086, + "epoch": 0.046918123275068994, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.002320924773812294, + "learning_rate": 1e-05, + "loss": 0.0508, + "num_tokens": 37604865.0, + "reward": 0.234375, + "reward_std": 0.3135228157043457, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999098777770996, + "sampling/importance_sampling_ratio/min": 0.026153141632676125, + "sampling/sampling_logp_difference/max": 3.6437859535217285, + "sampling/sampling_logp_difference/mean": 0.019532475620508194, + "step": 51 + }, + { + "clip_ratio/high_max": 1.6350510122720152e-05, + "clip_ratio/high_mean": 4.087627530680038e-06, + "clip_ratio/low_mean": 2.351988746340794e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7607515221461654e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15668.0, + "completions/mean_length": 6073.8984375, + "completions/mean_terminated_length": 5992.71630859375, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 1.0713753998279572, + "epoch": 0.04783808647654094, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.002212709980085492, + 
"learning_rate": 1e-05, + "loss": 0.0668, + "num_tokens": 38405196.0, + "reward": 0.359375, + "reward_std": 0.22119548916816711, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998978972434998, + "sampling/importance_sampling_ratio/min": 8.706459084351081e-06, + "sampling/sampling_logp_difference/max": 11.651445388793945, + "sampling/sampling_logp_difference/mean": 0.021252838894724846, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 3.729486718384578e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 3.729486718384578e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15299.0, + "completions/mean_length": 5838.71875, + "completions/mean_terminated_length": 5671.33349609375, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "entropy": 1.021155133843422, + "epoch": 0.04875804967801288, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.001135052996687591, + "learning_rate": 1e-05, + "loss": 0.0178, + "num_tokens": 39171704.0, + "reward": 0.28125, + "reward_std": 0.23410367965698242, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999173879623413, + "sampling/importance_sampling_ratio/min": 0.003084881929680705, + "sampling/sampling_logp_difference/max": 5.7812418937683105, + "sampling/sampling_logp_difference/mean": 0.020781882107257843, + "step": 53 + }, + { + "clip_ratio/high_max": 1.7124169744420215e-05, + "clip_ratio/high_mean": 4.281042436105054e-06, + "clip_ratio/low_mean": 3.706903294187214e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.135007543482061e-05, + 
"completions/clipped_ratio": 0.0, + "completions/max_length": 14617.0, + "completions/max_terminated_length": 14617.0, + "completions/mean_length": 6358.5859375, + "completions/mean_terminated_length": 6358.5859375, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.9720487147569656, + "epoch": 0.04967801287948482, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002638082252815366, + "learning_rate": 1e-05, + "loss": 0.0145, + "num_tokens": 40003859.0, + "reward": 0.40625, + "reward_std": 0.3174618184566498, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.4930621087551117, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000380277633667, + "sampling/importance_sampling_ratio/min": 0.01960253342986107, + "sampling/sampling_logp_difference/max": 3.932096481323242, + "sampling/sampling_logp_difference/mean": 0.01991666667163372, + "step": 54 + }, + { + "clip_ratio/high_max": 6.55582925901399e-06, + "clip_ratio/high_mean": 2.994117721755174e-06, + "clip_ratio/low_mean": 2.222621503733535e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.5220332759090525e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 14753.0, + "completions/max_terminated_length": 14753.0, + "completions/mean_length": 4634.1875, + "completions/mean_terminated_length": 4634.1875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.9715309366583824, + "epoch": 0.050597976080956765, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.001994960242882371, + "learning_rate": 1e-05, + "loss": 0.0221, + "num_tokens": 40616483.0, + "reward": 0.4375, + "reward_std": 0.29644322395324707, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000698566436768, + 
"sampling/importance_sampling_ratio/min": 1.0510009815334342e-05, + "sampling/sampling_logp_difference/max": 11.46318244934082, + "sampling/sampling_logp_difference/mean": 0.01902047172188759, + "step": 55 + }, + { + "clip_ratio/high_max": 2.2474248908110894e-05, + "clip_ratio/high_mean": 7.571314540655294e-06, + "clip_ratio/low_mean": 4.3583780325207044e-05, + "clip_ratio/low_min": 4.6013396968191955e-06, + "clip_ratio/region_mean": 5.1155094070054474e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15953.0, + "completions/mean_length": 6596.25, + "completions/mean_terminated_length": 6361.34423828125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.8207943215966225, + "epoch": 0.051517939282428704, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019902780186384916, + "learning_rate": 1e-05, + "loss": 0.0506, + "num_tokens": 41484443.0, + "reward": 0.4453125, + "reward_std": 0.326668381690979, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000016689300537, + "sampling/importance_sampling_ratio/min": 7.485233072657138e-05, + "sampling/sampling_logp_difference/max": 9.499993324279785, + "sampling/sampling_logp_difference/mean": 0.018301833420991898, + "step": 56 + }, + { + "clip_ratio/high_max": 3.0019932637515012e-06, + "clip_ratio/high_mean": 7.504983159378753e-07, + "clip_ratio/low_mean": 4.332785601945943e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.407835376696312e-05, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15834.0, + "completions/mean_length": 6785.75, + "completions/mean_terminated_length": 6313.70458984375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + 
"entropy": 0.9876058474183083, + "epoch": 0.05243790248390064, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0015235114842653275, + "learning_rate": 1e-05, + "loss": 0.0128, + "num_tokens": 42372235.0, + "reward": 0.2421875, + "reward_std": 0.325075626373291, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999551773071289, + "sampling/importance_sampling_ratio/min": 0.026679370552301407, + "sampling/sampling_logp_difference/max": 3.6238646507263184, + "sampling/sampling_logp_difference/mean": 0.019945615902543068, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 2.1349006601667497e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.1349006601667497e-05, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14726.0, + "completions/mean_length": 4881.2109375, + "completions/mean_terminated_length": 4510.1533203125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.989942155778408, + "epoch": 0.05335786568537258, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002033712575212121, + "learning_rate": 1e-05, + "loss": 0.1088, + "num_tokens": 43015238.0, + "reward": 0.4375, + "reward_std": 0.2869548797607422, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.49802759289741516, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000300407409668, + "sampling/importance_sampling_ratio/min": 0.0001238943514181301, + "sampling/sampling_logp_difference/max": 8.996081352233887, + "sampling/sampling_logp_difference/mean": 0.01887543685734272, + "step": 58 + }, + { + "clip_ratio/high_max": 2.584004687378183e-05, + "clip_ratio/high_mean": 6.4600117184454575e-06, + 
"clip_ratio/low_mean": 2.1371045761497953e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.7831058105221018e-05, + "completions/clipped_ratio": 0.0, + "completions/max_length": 15001.0, + "completions/max_terminated_length": 15001.0, + "completions/mean_length": 4725.3984375, + "completions/mean_terminated_length": 4725.3984375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 1.0350637435913086, + "epoch": 0.05427782888684453, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0030296226032078266, + "learning_rate": 1e-05, + "loss": 0.0691, + "num_tokens": 43637737.0, + "reward": 0.4453125, + "reward_std": 0.32035762071609497, + "rewards/accuracy_reward/mean": 0.4453125, + "rewards/accuracy_reward/std": 0.4989531338214874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999939203262329, + "sampling/importance_sampling_ratio/min": 0.00022932067804504186, + "sampling/sampling_logp_difference/max": 8.380389213562012, + "sampling/sampling_logp_difference/mean": 0.01995944231748581, + "step": 59 + }, + { + "clip_ratio/high_max": 1.994733975152485e-05, + "clip_ratio/high_mean": 4.986834937881213e-06, + "clip_ratio/low_mean": 3.5168303838872816e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.015513832200668e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 4918.171875, + "completions/mean_terminated_length": 4736.1748046875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.965274304151535, + "epoch": 0.05519779208831647, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.002758471528068185, + "learning_rate": 1e-05, + "loss": 0.0845, + "num_tokens": 44285327.0, + "reward": 0.328125, + "reward_std": 0.27328526973724365, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 
0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999663233757019, + "sampling/importance_sampling_ratio/min": 0.010958661325275898, + "sampling/sampling_logp_difference/max": 4.513625144958496, + "sampling/sampling_logp_difference/mean": 0.019083233550190926, + "step": 60 + }, + { + "clip_ratio/high_max": 1.0621563887980301e-05, + "clip_ratio/high_mean": 2.6553909719950752e-06, + "clip_ratio/low_mean": 3.838553107016196e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.1040922042157035e-05, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15031.0, + "completions/mean_length": 4998.2890625, + "completions/mean_terminated_length": 4908.6376953125, + "completions/min_length": 524.0, + "completions/min_terminated_length": 524.0, + "entropy": 0.9200445115566254, + "epoch": 0.05611775528978841, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.0027611786499619484, + "learning_rate": 1e-05, + "loss": 0.0575, + "num_tokens": 44944356.0, + "reward": 0.3515625, + "reward_std": 0.3895368278026581, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999884366989136, + "sampling/importance_sampling_ratio/min": 0.0018651526188477874, + "sampling/sampling_logp_difference/max": 6.284412384033203, + "sampling/sampling_logp_difference/mean": 0.017853498458862305, + "step": 61 + }, + { + "clip_ratio/high_max": 1.0136624496226432e-05, + "clip_ratio/high_mean": 2.534156124056608e-06, + "clip_ratio/low_mean": 2.0260404085092887e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 2.2794560095462657e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16110.0, + "completions/mean_length": 6290.1796875, + 
"completions/mean_terminated_length": 6129.96044921875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.9360214695334435, + "epoch": 0.05703771849126035, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0015557854203507304, + "learning_rate": 1e-05, + "loss": 0.0111, + "num_tokens": 45767867.0, + "reward": 0.34375, + "reward_std": 0.30168038606643677, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.47682511806488037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999427795410156, + "sampling/importance_sampling_ratio/min": 0.0011004531988874078, + "sampling/sampling_logp_difference/max": 6.812033176422119, + "sampling/sampling_logp_difference/mean": 0.0200855303555727, + "step": 62 + }, + { + "clip_ratio/high_max": 2.2559511307918e-06, + "clip_ratio/high_mean": 5.6398778269795e-07, + "clip_ratio/low_mean": 4.51761221711422e-05, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 4.574010984015331e-05, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16366.0, + "completions/mean_length": 6486.15625, + "completions/mean_terminated_length": 6248.6083984375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.863138921558857, + "epoch": 0.05795768169273229, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0026953541673719883, + "learning_rate": 1e-05, + "loss": -0.0194, + "num_tokens": 46618575.0, + "reward": 0.2578125, + "reward_std": 0.2580180764198303, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999406337738037, + "sampling/importance_sampling_ratio/min": 0.0011708897072821856, + "sampling/sampling_logp_difference/max": 6.749991416931152, + 
"sampling/sampling_logp_difference/mean": 0.01863238587975502, + "step": 63 + }, + { + "clip_ratio/high_max": 1.0073357771034352e-05, + "clip_ratio/high_mean": 2.518339442758588e-06, + "clip_ratio/low_mean": 2.787370635815023e-05, + "clip_ratio/low_min": 3.837534222839167e-06, + "clip_ratio/region_mean": 3.0392045573535142e-05, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16010.0, + "completions/mean_length": 6442.7734375, + "completions/mean_terminated_length": 6284.9765625, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 1.0242054909467697, + "epoch": 0.05887764489420423, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0024442619178444147, + "learning_rate": 1e-05, + "loss": 0.0569, + "num_tokens": 47462274.0, + "reward": 0.328125, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4713755249977112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998892545700073, + "sampling/importance_sampling_ratio/min": 4.9445447736218284e-09, + "sampling/sampling_logp_difference/max": 19.124980926513672, + "sampling/sampling_logp_difference/mean": 0.019810764119029045, + "step": 64 + } + ], + "logging_steps": 1, + "max_steps": 1024, + "num_input_tokens_seen": 47462274, + "num_train_epochs": 1, + "save_steps": 64, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/dapo_lora_plus_20251202_001141/output.log b/dapo_lora_plus_20251202_001141/output.log new file mode 100644 index 0000000000000000000000000000000000000000..a5a9eaf66ed4b3f66a8b5d1772df9fd85210dee8 --- 
/dev/null +++ b/dapo_lora_plus_20251202_001141/output.log @@ -0,0 +1,6407 @@ +W1202 00:12:01.875000 96731 torch/distributed/run.py:774] +W1202 00:12:01.875000 96731 torch/distributed/run.py:774] ***************************************** +W1202 00:12:01.875000 96731 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W1202 00:12:01.875000 96731 torch/distributed/run.py:774] ***************************************** +INFO 12-02 00:12:24 [__init__.py:216] Automatically detected platform cuda. +INFO 12-02 00:12:24 [__init__.py:216] Automatically detected platform cuda. +INFO 12-02 00:12:24 [__init__.py:216] Automatically detected platform cuda. +INFO 12-02 00:12:24 [__init__.py:216] Automatically detected platform cuda. +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora_plus', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lora_plus_20251202_001141', run_name='outputs/dapo_lora_plus_20251202_001141', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), 
logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000))TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora_plus', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lora_plus_20251202_001141', run_name='outputs/dapo_lora_plus_20251202_001141', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) + +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora_plus', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, 
output_dir='outputs/dapo_lora_plus_20251202_001141', run_name='outputs/dapo_lora_plus_20251202_001141', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lora_plus', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lora_plus_20251202_001141', run_name='outputs/dapo_lora_plus_20251202_001141', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), 
logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +[OpenTinker] 2025-12-02 00:12:29,815 - root - INFO - Output directory outputs/dapo_lora_plus_20251202_001141 already exists, using it +[OpenTinker] 2025-12-02 00:12:29,815 - root - INFO - Output directory outputs/dapo_lora_plus_20251202_001141 already exists, using it +[OpenTinker] 2025-12-02 00:12:29,815 - root - INFO - Output directory outputs/dapo_lora_plus_20251202_001141 already exists, using it +[OpenTinker] 2025-12-02 00:12:29,816 - root - INFO - Output directory outputs/dapo_lora_plus_20251202_001141 already exists, using it +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: setting up run v6m8ctck +wandb: setting up run 3kl3jf3g +wandb: setting up run l45ioj2b +wandb: setting up run naqjbub0 +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_001233-naqjbub0 +wandb: Run `wandb offline` to turn off syncing. 
+wandb: Syncing run outputs/dapo_lora_plus_20251202_001141 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/naqjbub0 +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_001233-l45ioj2b +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dapo_lora_plus_20251202_001141 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/l45ioj2b +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +[OpenTinker] 2025-12-02 00:12:35,516 - root - INFO - Wandb initialized successfully +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 00:12:35,517 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-02 00:12:35,517 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-02 00:12:35,517 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_001233-3kl3jf3g +wandb: Run `wandb offline` to turn off syncing. 
+wandb: Syncing run outputs/dapo_lora_plus_20251202_001141 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/3kl3jf3g +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_001233-v6m8ctck +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dapo_lora_plus_20251202_001141 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/v6m8ctck +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 00:12:35,809 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-02 00:12:35,809 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 00:12:35,981 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-02 00:12:35,981 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-02 00:12:36,844 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 00:12:36,872 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 00:12:37,011 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 00:12:37,195 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 00:12:40,047 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-02 00:12:40,076 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-02 00:12:40,102 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-02 00:12:40,222 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! 
+[OpenTinker] 2025-12-02 00:12:41,300 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-02 00:12:41,300 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-02 00:12:41,324 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-02 00:12:41,324 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-02 00:12:41,467 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-02 00:12:41,467 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-02 00:12:41,475 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-02 00:12:41,475 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-02 00:12:41,729 - root - INFO - Lora configured successfully +[OpenTinker] 2025-12-02 00:12:41,767 - root - INFO - Lora configured successfully +[OpenTinker] 2025-12-02 00:12:41,831 - root - INFO - Lora configured successfully +[OpenTinker] 2025-12-02 00:12:41,832 - root - INFO - Lora configured successfully +[OpenTinker] 2025-12-02 00:12:42,231 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpye19kupf/test.c -o /tmp/tmpye19kupf/test.o +[OpenTinker] 2025-12-02 00:12:42,231 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpkzj37tn0/test.c -o /tmp/tmpkzj37tn0/test.o +[OpenTinker] 2025-12-02 00:12:42,246 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmppq3_j2pr/test.c -o /tmp/tmppq3_j2pr/test.o +[OpenTinker] 2025-12-02 00:12:42,246 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG 
-fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp1tdyxj41/test.c -o /tmp/tmp1tdyxj41/test.o +[OpenTinker] 2025-12-02 00:12:42,261 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpye19kupf/test.o -laio -o /tmp/tmpye19kupf/a.out +[OpenTinker] 2025-12-02 00:12:42,275 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpkzj37tn0/test.o -laio -o /tmp/tmpkzj37tn0/a.out +[OpenTinker] 2025-12-02 00:12:42,290 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmppq3_j2pr/test.o -laio -o /tmp/tmppq3_j2pr/a.out +[OpenTinker] 2025-12-02 00:12:42,305 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp1tdyxj41/test.o -laio -o /tmp/tmp1tdyxj41/a.out +[OpenTinker] 2025-12-02 00:12:42,789 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp4vltduv2/test.c -o /tmp/tmp4vltduv2/test.o +[OpenTinker] 2025-12-02 00:12:42,802 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpgumyw8zr/test.c -o /tmp/tmpgumyw8zr/test.o +[OpenTinker] 2025-12-02 00:12:42,803 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpeqggqm0g/test.c -o /tmp/tmpeqggqm0g/test.o +[OpenTinker] 2025-12-02 00:12:42,830 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpz4mi1igy/test.c -o /tmp/tmpz4mi1igy/test.o +[OpenTinker] 2025-12-02 00:12:42,846 - root - INFO - gcc -pthread -B 
/root/miniconda3/compiler_compat /tmp/tmp4vltduv2/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp4vltduv2/a.out +[OpenTinker] 2025-12-02 00:12:42,862 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpeqggqm0g/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpeqggqm0g/a.out +[OpenTinker] 2025-12-02 00:12:42,875 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpgumyw8zr/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpgumyw8zr/a.out +[OpenTinker] 2025-12-02 00:12:42,886 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpz4mi1igy/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpz4mi1igy/a.out +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0> +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO cudaDriverVersion 12090 +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO cudaDriverVersion 12090 +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO cudaDriverVersion 12090 +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0> +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0> +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO cudaDriverVersion 12090 +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO 
NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Bootstrap: Using eth0:10.146.224.17<0> +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. 
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0> +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Initialized NET plugin Socket +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0> +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Initialized NET plugin Socket +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0> +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Initialized NET plugin Socket +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NET/Socket : Using [0]eth0:10.146.224.17<0> +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Initialized NET plugin Socket +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO ncclCommInitRankConfig comm 0x1dbf3b00 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0x67adbc2d692fd1c8 - Init START +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO ncclCommInitRankConfig comm 0x1df75b30 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0x67adbc2d692fd1c8 - Init START +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO ncclCommInitRankConfig comm 0x1e24ba00 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0x67adbc2d692fd1c8 - Init START +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO ncclCommInitRankConfig comm 0x1f1d4810 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 
commId 0x67adbc2d692fd1c8 - Init START +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Bootstrap timings total 0.002802 (create 0.000019, send 0.000086, recv 0.002229, ring 0.000169, delay 0.000000) +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Bootstrap timings total 0.002651 (create 0.000020, send 0.000086, recv 0.000098, ring 0.001236, delay 0.000001) +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Bootstrap timings total 0.001716 (create 0.000020, send 0.000107, recv 0.000044, ring 0.000116, delay 0.000000) +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Bootstrap timings total 0.000717 (create 0.000021, send 0.000095, recv 0.000166, ring 0.000061, delay 0.000001) +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. 
+lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO comm 0x1e24ba00 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO comm 0x1df75b30 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO comm 0x1f1d4810 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO comm 0x1dbf3b00 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 
+lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO 
Channel 19/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-g2ri-2:96975:98060 [0] NCCL INFO [Proxy Service] Device 0 CPU core 21 +lshn-qs-g2ri-2:96975:98061 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 24 +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-g2ri-2:96978:98062 [3] NCCL INFO [Proxy Service] Device 3 CPU core 6 +lshn-qs-g2ri-2:96978:98064 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 10 +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-g2ri-2:96977:98063 [2] NCCL INFO [Proxy Service] Device 2 CPU core 28 +lshn-qs-g2ri-2:96977:98065 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 128 +lshn-qs-g2ri-2:96976:98066 [1] NCCL INFO [Proxy Service] Device 1 CPU core 11 +lshn-qs-g2ri-2:96976:98067 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 112 +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO ncclCommInitRankConfig comm 0x1e24ba00 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 commId 0x67adbc2d692fd1c8 - Init COMPLETE +lshn-qs-g2ri-2:96978:98055 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 1.02 (kernels 0.15, alloc 0.69, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.05, rest 0.09) +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO ncclCommInitRankConfig comm 0x1f1d4810 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 commId 0x67adbc2d692fd1c8 - Init COMPLETE +lshn-qs-g2ri-2:96977:98054 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 1.03 (kernels 0.16, alloc 0.69, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.05, rest 0.09) +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO ncclCommInitRankConfig comm 0x1df75b30 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 commId 0x67adbc2d692fd1c8 - Init COMPLETE +lshn-qs-g2ri-2:96975:98052 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 1.03 (kernels 0.16, alloc 0.69, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.05, rest 0.08) +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO ncclCommInitRankConfig comm 0x1dbf3b00 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 commId 0x67adbc2d692fd1c8 - Init COMPLETE +lshn-qs-g2ri-2:96976:98053 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 1.03 (kernels 0.16, alloc 0.69, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.05, rest 0.09) +[OpenTinker] 2025-12-02 00:12:47,140 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-02 00:12:47,141 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-02 00:12:47,229 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-02 00:12:47,229 - root - INFO - Training model with GRPO +INFO 12-02 00:12:47 [utils.py:328] non-default args: {'seed': 1, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-02 00:12:47 [utils.py:328] non-default args: {'seed': 3, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-02 00:12:47 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-02 00:12:47 [utils.py:328] non-default args: {'seed': 2, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 
'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-02 00:13:05 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 00:13:05 [__init__.py:1815] Using max model len 16896 +INFO 12-02 00:13:05 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 00:13:05 [__init__.py:1815] Using max model len 16896 +INFO 12-02 00:13:05 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 00:13:05 [__init__.py:1815] Using max model len 16896 +INFO 12-02 00:13:05 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 00:13:05 [__init__.py:1815] Using max model len 16896 +INFO 12-02 00:13:06 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 00:13:06 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 00:13:06 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 00:13:06 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 00:13:07 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-02 00:13:07 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-02 00:13:07 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-02 00:13:07 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. 
+INFO 12-02 00:13:08 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 12-02 00:13:08 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, 
tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=3, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 12-02 00:13:08 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, 
disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 12-02 00:13:08 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, 
reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +[rank2]:[W1202 00:13:10.314419448 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1 +[rank3]:[W1202 00:13:10.398868254 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1 +[rank1]:[W1202 00:13:10.429521514 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1 +[rank0]:[W1202 00:13:10.622553720 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. 
(function operator()) +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO ncclCommSplit comm 0x1f2f9af0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 1 color 2003953581 key 3- Init START +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO ncclCommSplit comm 0x1eca0630 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 1 color 2003953581 key 1- Init START +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO ncclCommSplit comm 0x20310430 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 1 color 2003953581 key 2- Init START +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO ncclCommSplit comm 0x1f5ed7a0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 1 color 2003953581 key 0- Init START +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Setting affinity for 
GPU 0 to 0-47,96-143 +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO comm 0x1eca0630 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO comm 0x1f2f9af0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO comm 0x20310430 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO comm 0x1f5ed7a0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 
[8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO 
Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96976:98205 [1] NCCL INFO [Proxy Service] Device 1 CPU core 104 +lshn-qs-g2ri-2:96976:98206 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 105 +lshn-qs-g2ri-2:96977:98207 [2] NCCL INFO [Proxy Service] Device 2 CPU core 4 +lshn-qs-g2ri-2:96977:98208 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 106 +lshn-qs-g2ri-2:96978:98209 [3] NCCL INFO [Proxy Service] Device 3 CPU core 23 +lshn-qs-g2ri-2:96978:98210 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 26 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-g2ri-2:96975:98211 [0] NCCL INFO [Proxy Service] Device 0 CPU core 113 +lshn-qs-g2ri-2:96975:98212 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 30 +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96975:98204 [0] 
NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO ncclCommSplit comm 0x1f2f9af0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 1 color 2003953581 key 3 - Init COMPLETE +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO ncclCommSplit comm 0x20310430 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 1 color 2003953581 key 2 - Init COMPLETE +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO ncclCommSplit comm 0x1eca0630 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 1 color 2003953581 key 1 - Init COMPLETE +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO ncclCommSplit comm 0x1f5ed7a0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 1 color 2003953581 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96978:98198 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.29 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.23) +lshn-qs-g2ri-2:96977:98195 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.38 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.32) +lshn-qs-g2ri-2:96976:98200 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.26 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.20) +lshn-qs-g2ri-2:96975:98204 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.02, graphs 0.01, connections 0.02, rest 0.01) +[Gloo] Rank 0[Gloo] Rank is connected to 3[Gloo] Rank 1[Gloo] Rank 32 is connected to peer ranks. 
Expected number of connected peer ranks is : is connected to 3 is connected to 33 peer ranks. 3 + peer ranks. Expected number of connected peer ranks is : peer ranks. Expected number of connected peer ranks is : Expected number of connected peer ranks is : 33 +3 + +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO ncclCommSplit comm 0x1f701db0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 2 color 59908776 key 0- Init START +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO comm 0x1f701db0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96975:98230 
[0] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO 
Channel 38/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 
[21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96975:98235 [0] NCCL INFO [Proxy Service] Device 0 CPU core 3 +lshn-qs-g2ri-2:96975:98236 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 110 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO ncclCommSplit comm 0x1f701db0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 2 color 59908776 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96975:98230 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO ncclCommSplit comm 0x1edb4ce0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 4 color 440515407 key 0- Init START +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO comm 0x1edb4ce0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 39/64 : 0 
+lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96976:98250 [1] NCCL INFO [Proxy Service] Device 1 CPU core 32 +lshn-qs-g2ri-2:96976:98251 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 136 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO ncclCommSplit comm 0x1edb4ce0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 4 color 440515407 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96976:98246 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.04 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.03, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO ncclCommSplit comm 0x20423ff0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 6 color 1227022723 key 0- Init START +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO comm 0x20423ff0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 39/64 : 0 
+lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96977:98265 [2] NCCL INFO [Proxy Service] Device 2 CPU core 19 +lshn-qs-g2ri-2:96977:98266 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 41 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO ncclCommSplit comm 0x20423ff0 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 6 color 1227022723 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96977:98261 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO ncclCommSplit comm 0x1f40c020 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 8 color 1301067556 key 0- Init START +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO comm 0x1f40c020 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 
07/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 40/64 : 0 
+lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] 
-1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96978:98282 [3] NCCL INFO [Proxy Service] Device 3 CPU core 29 +lshn-qs-g2ri-2:96978:98283 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 105 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO ncclCommSplit comm 0x1f40c020 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 8 color 1301067556 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96978:98276 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO ncclCommSplit comm 0x20e5a6a0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 9 color 59908776 key 0- Init START +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO comm 0x20e5a6a0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 13/64 : 0 
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 46/64 : 0 
+lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] 
-1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96975:98291 [0] NCCL INFO [Proxy Service] Device 0 CPU core 12 +lshn-qs-g2ri-2:96975:98292 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 124 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO ncclCommSplit comm 0x20e5a6a0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 9 color 59908776 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96975:98281 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.07) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO ncclCommSplit comm 0x204f15c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 11 color 440515407 key 0- Init START +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO comm 0x204f15c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 39/64 : 0 
+lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96976:98306 [1] NCCL INFO [Proxy Service] Device 1 CPU core 14 +lshn-qs-g2ri-2:96976:98307 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 118 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO ncclCommSplit comm 0x204f15c0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 11 color 440515407 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96976:98302 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO ncclCommSplit comm 0x21b6ed10 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 13 color 1227022723 key 0- Init START +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO comm 0x21b6ed10 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 39/64 : 0 
+lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96977:98321 [2] NCCL INFO [Proxy Service] Device 2 CPU core 104 +lshn-qs-g2ri-2:96977:98322 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 112 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO ncclCommSplit comm 0x21b6ed10 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 13 color 1227022723 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96977:98317 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.05 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.04, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO ncclCommSplit comm 0x1f515b90 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 15 color 1301067556 key 0- Init START +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO comm 0x1f515b90 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 
07/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 40/64 : 0 
+lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] 
-1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96978:98338 [3] NCCL INFO [Proxy Service] Device 3 CPU core 41 +lshn-qs-g2ri-2:96978:98339 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 27 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO ncclCommSplit comm 0x1f515b90 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 15 color 1301067556 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96978:98332 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO ncclCommSplit comm 0x20f622b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 16 color 59908776 key 0- Init START +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO comm 0x20f622b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 13/64 : 0 
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 46/64 : 0 
+lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] 
-1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96975:98347 [0] NCCL INFO [Proxy Service] Device 0 CPU core 3 +lshn-qs-g2ri-2:96975:98348 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 109 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO ncclCommSplit comm 0x20f622b0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 16 color 59908776 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96975:98337 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.08 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO ncclCommSplit comm 0x205f91d0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 18 color 440515407 key 0- Init START +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO comm 0x205f91d0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 39/64 : 0 
+lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96976:98362 [1] NCCL INFO [Proxy Service] Device 1 CPU core 7 +lshn-qs-g2ri-2:96976:98363 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 115 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO ncclCommSplit comm 0x205f91d0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 18 color 440515407 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96976:98358 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO ncclCommSplit comm 0x21c76920 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 20 color 1227022723 key 0- Init START +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO comm 0x21c76920 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 39/64 : 0 
+lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96977:98377 [2] NCCL INFO [Proxy Service] Device 2 CPU core 105 +lshn-qs-g2ri-2:96977:98378 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 28 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO ncclCommSplit comm 0x21c76920 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 20 color 1227022723 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96977:98373 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO ncclCommSplit comm 0x20bd6290 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 22 color 1301067556 key 0- Init START +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO comm 0x20bd6290 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 
07/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 40/64 : 0 
+lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] 
-1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96978:98394 [3] NCCL INFO [Proxy Service] Device 3 CPU core 26 +lshn-qs-g2ri-2:96978:98395 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 16 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO ncclCommSplit comm 0x20bd6290 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 22 color 1301067556 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96978:98388 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.04 (kernels 0.00, alloc 0.01, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO ncclCommSplit comm 0x21069ec0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 23 color 59908776 key 0- Init START +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO comm 0x21069ec0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 13/64 : 0 
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 46/64 : 0 
+lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] 
-1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96975:98403 [0] NCCL INFO [Proxy Service] Device 0 CPU core 140 +lshn-qs-g2ri-2:96975:98404 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 13 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO ncclCommSplit comm 0x21069ec0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 23 color 59908776 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96975:98393 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.06) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO ncclCommSplit comm 0x20700de0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 25 color 440515407 key 0- Init START +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO comm 0x20700de0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 39/64 : 0 
+lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96976:98418 [1] NCCL INFO [Proxy Service] Device 1 CPU core 132 +lshn-qs-g2ri-2:96976:98419 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 31 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO ncclCommSplit comm 0x20700de0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 25 color 440515407 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96976:98414 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO ncclCommSplit comm 0x21d7e530 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 27 color 1227022723 key 0- Init START +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO comm 0x21d7e530 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 39/64 : 0 
+lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96977:98433 [2] NCCL INFO [Proxy Service] Device 2 CPU core 3 +lshn-qs-g2ri-2:96977:98434 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 7 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO ncclCommSplit comm 0x21d7e530 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 27 color 1227022723 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96977:98429 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO ncclCommSplit comm 0x20cddea0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 29 color 1301067556 key 0- Init START +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO comm 0x20cddea0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 
07/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 40/64 : 0 
+lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] 
-1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96978:98450 [3] NCCL INFO [Proxy Service] Device 3 CPU core 101 +lshn-qs-g2ri-2:96978:98451 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 19 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO ncclCommSplit comm 0x20cddea0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 29 color 1301067556 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96978:98444 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO ncclCommSplit comm 0x21171ad0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 30 color 59908776 key 0- Init START +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO comm 0x21171ad0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 13/64 : 0 
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 46/64 : 0 
+lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] 
-1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96975:98459 [0] NCCL INFO [Proxy Service] Device 0 CPU core 140 +lshn-qs-g2ri-2:96975:98460 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 105 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO ncclCommSplit comm 0x21171ad0 rank 0 nranks 1 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 30 color 59908776 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96975:98449 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO ncclCommSplit comm 0x208089f0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 32 color 440515407 key 0- Init START +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO comm 0x208089f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 39/64 : 0 
+lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96976:98474 [1] NCCL INFO [Proxy Service] Device 1 CPU core 139 +lshn-qs-g2ri-2:96976:98475 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 96 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO ncclCommSplit comm 0x208089f0 rank 0 nranks 1 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 32 color 440515407 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96976:98470 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO ncclCommSplit comm 0x21e86140 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 34 color 1227022723 key 0- Init START +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO comm 0x21e86140 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 06/64 : 0 
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 39/64 : 0 
+lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96977:98489 [2] NCCL INFO [Proxy Service] Device 2 CPU core 32 +lshn-qs-g2ri-2:96977:98490 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 27 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO ncclCommSplit comm 0x21e86140 rank 0 nranks 1 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 34 color 1227022723 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96977:98485 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Using network Socket +INFO 12-02 00:13:11 [parallel_state.py:1165] rank 1 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-02 00:13:11 [parallel_state.py:1165] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO ncclCommSplit comm 0x20de5ab0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 36 color 1301067556 key 0- Init START +INFO 12-02 00:13:11 [parallel_state.py:1165] rank 2 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO comm 0x20de5ab0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] 
NCCL INFO Channel 06/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 
39/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] 
-1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0->-1 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-g2ri-2:96978:98501 [3] NCCL INFO [Proxy Service] Device 3 CPU core 104 +lshn-qs-g2ri-2:96978:98502 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 136 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO ncclCommSplit comm 0x20de5ab0 rank 0 nranks 1 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 36 color 1301067556 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96978:98500 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +INFO 12-02 00:13:11 [parallel_state.py:1165] rank 3 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-02 00:13:11 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-02 00:13:11 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-02 00:13:11 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... 
+INFO 12-02 00:13:11 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-02 00:13:11 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 00:13:12 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 00:13:12 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 00:13:12 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 00:13:12 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 00:13:12 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 00:13:12 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 00:13:12 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 00:13:12 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 00:13:12 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 00:13:12 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 00:13:12 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 00:13:13 [weight_utils.py:406] No model.safetensors.index.json found in remote. 
+ + Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Channel 23/0 : 1[1] 
-> 2[2] via P2P/CUMEM +INFO 12-02 00:13:29 [llm.py:295] Supported_tasks: ('generate',) +INFO 12-02 00:13:29 [__init__.py:36] No IOProcessor plugins requested by the model +INFO 12-02 00:13:29 [llm.py:295] Supported_tasks: ('generate',) +INFO 12-02 00:13:29 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM +INFO 12-02 00:13:29 [llm.py:295] Supported_tasks: ('generate',) +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM +INFO 12-02 00:13:29 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM 
+lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 18/0 : 3[3] -> 
0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO 
Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98622 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:96977:98623 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:96976:98620 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:96975:98621 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. 
+[OpenTinker] 2025-12-02 00:13:30,340 - accelerate.accelerator - WARNING - Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value. +lshn-qs-g2ri-2:96977:96977 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:96978 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96976:96976 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96975:96975 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO Using network Socket +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO ncclCommSplit comm 0x1684d6c0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 37 color 2003953581 key 3- Init START +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO ncclCommSplit comm 0x1b536c40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 37 color 2003953581 key 0- Init START +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO ncclCommSplit comm 0x1ab9a240 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 37 color 2003953581 key 1- Init START +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO ncclCommSplit comm 0x1c131dc0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 37 color 2003953581 key 2- Init START +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO MNNVL busId 0xa2000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO MNNVL busId 0x8000 fabric UUID 0.0 cliqueId 0x0 
state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO MNNVL busId 0x7e000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO MNNVL busId 0xc6000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO Setting affinity for GPU 1 to 0-47,96-143 +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO Setting affinity for GPU 3 to 0-47,96-143 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Setting affinity for GPU 0 to 0-47,96-143 +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO Setting affinity for GPU 2 to 0-47,96-143 +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO comm 0x1ab9a240 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO comm 0x1684d6c0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO comm 0x1b536c40 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO comm 0x1c131dc0 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] 
-1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO 
Channel 18/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-g2ri-2:96976:98636 [1] NCCL INFO [Proxy Service] Device 1 CPU core 45 +lshn-qs-g2ri-2:96976:98637 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 3 +lshn-qs-g2ri-2:96978:98638 [3] NCCL INFO [Proxy Service] Device 3 CPU core 40 +lshn-qs-g2ri-2:96978:98639 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 102 +lshn-qs-g2ri-2:96977:98640 [2] NCCL INFO [Proxy Service] Device 2 CPU core 8 +lshn-qs-g2ri-2:96977:98641 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 107 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-g2ri-2:96975:98642 [0] NCCL INFO [Proxy Service] Device 0 CPU core 21 +lshn-qs-g2ri-2:96975:98643 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 35 +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 
+lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO ncclCommSplit comm 0x1684d6c0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId c6000 parent 0x1e24ba00 splitCount 37 color 2003953581 key 3 - Init COMPLETE +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO ncclCommSplit comm 0x1c131dc0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId a2000 parent 0x1f1d4810 splitCount 37 color 2003953581 key 2 - Init COMPLETE +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO ncclCommSplit comm 0x1ab9a240 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 7e000 parent 0x1dbf3b00 splitCount 37 color 2003953581 key 1 - Init COMPLETE +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO ncclCommSplit comm 0x1b536c40 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 8000 parent 0x1df75b30 splitCount 37 color 2003953581 key 0 - Init COMPLETE +lshn-qs-g2ri-2:96978:98629 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.12 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.07) +lshn-qs-g2ri-2:96977:98626 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.07) +lshn-qs-g2ri-2:96976:98632 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.12 (kernels 0.00, alloc 0.00, bootstrap 0.00, 
allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.07) +lshn-qs-g2ri-2:96975:98635 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.08 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.01, connections 0.02, rest 0.02) +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL 
INFO Channel 06/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM 
+lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 12/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 13/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 14/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 15/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 16/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 17/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 15/0 : 0[0] -> 
1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 18/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 19/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 20/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 21/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 22/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Channel 23/0 : 3[3] -> 0[0] via P2P/CUMEM +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO 
Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM +lshn-qs-g2ri-2:96977:98645 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:96976:98647 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:96975:98646 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-g2ri-2:96978:98644 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +INFO 12-02 00:13:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:13:32 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 00:13:32 [block_pool.py:292] Successfully reset prefix cache +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. + + 0%| | 0/1024 [00:00](https://wandb.ai/mikastars-zhejiang-university/Tina/runs/8lt7zamw) + + +This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + +### Framework versions + +- TRL: 0.25.0 +- Transformers: 4.57.1 +- Pytorch: 2.8.0 +- Datasets: 4.4.1 +- Tokenizers: 0.22.1 + +## Citations + +Cite GRPO as: + +```bibtex +@article{shao2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. 
Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, +} + +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/README.md b/dapo_lorafa_20251202_173337/checkpoint-576/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b3fac4aca7a7fabb3a0972e6c9281e23853e2816 --- /dev/null +++ b/dapo_lorafa_20251202_173337/checkpoint-576/README.md @@ -0,0 +1,209 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +- grpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### 
Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.17.1 \ No newline at end of file diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/adapter_config.json b/dapo_lorafa_20251202_173337/checkpoint-576/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c074229bb0545de98cc4b88111a8b54705fd6f30 --- /dev/null +++ b/dapo_lorafa_20251202_173337/checkpoint-576/adapter_config.json @@ -0,0 +1,42 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "bias": "none", + "corda_config": null, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "qalora_group_size": 16, + "r": 32, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "o_proj", + 
"down_proj", + "v_proj", + "gate_proj", + "up_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/latest b/dapo_lorafa_20251202_173337/checkpoint-576/latest new file mode 100644 index 0000000000000000000000000000000000000000..1a40031386820b60f3a54acbdbae4813e4a986c7 --- /dev/null +++ b/dapo_lorafa_20251202_173337/checkpoint-576/latest @@ -0,0 +1 @@ +global_step576 \ No newline at end of file diff --git a/dapo_lorafa_20251202_173337/checkpoint-576/tokenizer_config.json b/dapo_lorafa_20251202_173337/checkpoint-576/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d252dd4e5764106823080946500c02a8ed8c90c9 --- /dev/null +++ b/dapo_lorafa_20251202_173337/checkpoint-576/tokenizer_config.json @@ -0,0 +1,194 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { 
+ "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 16384, + "pad_token": "<|end▁of▁sentence|>", + "sp_model_kwargs": {}, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/dapo_lorafa_20251202_173337/output.log b/dapo_lorafa_20251202_173337/output.log new file mode 100644 index 0000000000000000000000000000000000000000..114d01ebb27c49dc1857efbb4b639c968fd9b349 --- /dev/null +++ b/dapo_lorafa_20251202_173337/output.log @@ -0,0 +1,3293 @@ +W1202 17:34:02.231000 1217291 torch/distributed/run.py:774] +W1202 17:34:02.231000 1217291 torch/distributed/run.py:774] ***************************************** +W1202 17:34:02.231000 1217291 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +W1202 17:34:02.231000 1217291 torch/distributed/run.py:774] ***************************************** +INFO 12-02 17:34:24 [__init__.py:216] Automatically detected platform cuda. +INFO 12-02 17:34:24 [__init__.py:216] Automatically detected platform cuda. +INFO 12-02 17:34:24 [__init__.py:216] Automatically detected platform cuda. +INFO 12-02 17:34:24 [__init__.py:216] Automatically detected platform cuda. 
+TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lorafa', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lorafa_20251202_173337', run_name='outputs/dapo_lorafa_20251202_173337', resume_from_checkpoint='outputs/train/dapo_lorafa_20251201_161746/checkpoint-512', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +[OpenTinker] 2025-12-02 17:34:30,825 - root - INFO - Output directory outputs/dapo_lorafa_20251202_173337 already exists, using it +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lorafa', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lorafa_20251202_173337', 
run_name='outputs/dapo_lorafa_20251202_173337', resume_from_checkpoint='outputs/train/dapo_lorafa_20251201_161746/checkpoint-512', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +[OpenTinker] 2025-12-02 17:34:30,827 - root - INFO - Output directory outputs/dapo_lorafa_20251202_173337 already exists, using it +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lorafa', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lorafa_20251202_173337', run_name='outputs/dapo_lorafa_20251202_173337', resume_from_checkpoint='outputs/train/dapo_lorafa_20251201_161746/checkpoint-512', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, 
lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +[OpenTinker] 2025-12-02 17:34:30,831 - root - INFO - Output directory outputs/dapo_lorafa_20251202_173337 already exists, using it +TrainConfig(common=CommonConfig(seed=42, debug=False), model=ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', dtype='bfloat16'), peft=PeftConfig(type='lorafa', use_peft=True, task_type='CAUSAL_LM', r=32, lora_alpha=64, lora_dropout=0.05, total_step=1000, target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj']), training=TrainingConfig(learning_rate=1e-05, output_dir='outputs/dapo_lorafa_20251202_173337', run_name='outputs/dapo_lorafa_20251202_173337', resume_from_checkpoint='outputs/train/dapo_lorafa_20251201_161746/checkpoint-512', remove_unused_columns=False, gradient_accumulation_steps=8, num_train_epochs=1, max_completion_length=16384, num_generations=8, max_prompt_length=512, logging_steps=1, save_strategy='steps', save_steps=64, max_steps=1024, use_vllm=True, vllm_mode='colocate', vllm_gpu_memory_utilization=0.4, use_liger_kernel=False, epsilon_high=0.28, lr_scheduler_type='constant', lr_scheduler_kwargs={'min_lr_rate': 0.1}, loss_type='dapo', report_to=['wandb'], beta=0.0, warmup_ratio=0.0, per_device_train_batch_size=4, top_entropy_quantile=1.0), logging=LoggingConfig(trackio_space_id='Open-Tinker/Open-Tinker', trackio_project='grpo-full-qwen3-4b', wandb_project='grpo-full-qwen3-4b'), dataset=DatasetConfig(dataset_name_or_path='open-r1/DAPO-Math-17k-Processed', example_numbers=1000000000)) +[OpenTinker] 2025-12-02 17:34:30,836 
- root - INFO - Output directory outputs/dapo_lorafa_20251202_173337 already exists, using it +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: Currently logged in as: mikastars (mikastars-zhejiang-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +wandb: setting up run 8lt7zamw +wandb: setting up run t4d1xrj2 +wandb: setting up run hpd46kjr +wandb: setting up run qkisy38r +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_173434-8lt7zamw +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dapo_lorafa_20251202_173337 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/8lt7zamw +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_173434-qkisy38r +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dapo_lorafa_20251202_173337 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/qkisy38r +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_173434-t4d1xrj2 +wandb: Run `wandb offline` to turn off syncing. 
+wandb: Syncing run outputs/dapo_lorafa_20251202_173337 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/t4d1xrj2 +wandb: Tracking run with wandb version 0.22.3 +wandb: Run data is saved locally in /mnt/tidal-alsh-hilab/dataset/diandian/user/qingyu/Tina/wandb/run-20251202_173434-hpd46kjr +wandb: Run `wandb offline` to turn off syncing. +wandb: Syncing run outputs/dapo_lorafa_20251202_173337 +wandb: ⭐️ View project at https://wandb.ai/mikastars-zhejiang-university/Tina +wandb: 🚀 View run at https://wandb.ai/mikastars-zhejiang-university/Tina/runs/hpd46kjr +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 17:34:36,641 - root - INFO - Wandb initialized successfully +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 17:34:36,641 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+[OpenTinker] 2025-12-02 17:34:36,641 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-02 17:34:36,641 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 17:34:36,641 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-02 17:34:36,642 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +wandb: Detected [huggingface_hub.inference, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +[OpenTinker] 2025-12-02 17:34:36,765 - root - INFO - Wandb initialized successfully +[OpenTinker] 2025-12-02 17:34:36,766 - root - INFO - Loading tokenizer from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-02 17:34:37,986 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 17:34:38,089 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 17:34:39,180 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 17:34:39,449 - root - INFO - Loading dataset from open-r1/DAPO-Math-17k-Processed +[OpenTinker] 2025-12-02 17:34:41,016 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-02 17:34:41,112 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! 
+[OpenTinker] 2025-12-02 17:34:42,227 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-02 17:34:42,438 - root - INFO - Loading model from deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B +[OpenTinker] 2025-12-02 17:34:42,554 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-02 17:34:42,554 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-02 17:34:42,555 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-02 17:34:42,555 - root - INFO - Detected PEFT configuration, configuring lora +`torch_dtype` is deprecated! Use `dtype` instead! +`torch_dtype` is deprecated! Use `dtype` instead! +[OpenTinker] 2025-12-02 17:34:43,055 - root - INFO - Lora configured successfully +[OpenTinker] 2025-12-02 17:34:43,055 - root - INFO - Lora configured successfully +[OpenTinker] 2025-12-02 17:34:43,554 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp3lmny6fn/test.c -o /tmp/tmp3lmny6fn/test.o +[OpenTinker] 2025-12-02 17:34:43,562 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-02 17:34:43,568 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-02 17:34:43,568 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp1e_h4zes/test.c -o /tmp/tmp1e_h4zes/test.o +[OpenTinker] 2025-12-02 17:34:43,592 - root - INFO - Model loaded successfully +[OpenTinker] 2025-12-02 17:34:43,597 - root - INFO - Detected PEFT configuration, configuring lora +[OpenTinker] 2025-12-02 17:34:43,618 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp3lmny6fn/test.o -laio -o /tmp/tmp3lmny6fn/a.out +[OpenTinker] 2025-12-02 17:34:43,632 - root - INFO - gcc -pthread 
-B /root/miniconda3/compiler_compat /tmp/tmp1e_h4zes/test.o -laio -o /tmp/tmp1e_h4zes/a.out +[OpenTinker] 2025-12-02 17:34:43,943 - root - INFO - Lora configured successfully +[OpenTinker] 2025-12-02 17:34:43,976 - root - INFO - Lora configured successfully +[OpenTinker] 2025-12-02 17:34:44,097 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp0gl4wseb/test.c -o /tmp/tmp0gl4wseb/test.o +[OpenTinker] 2025-12-02 17:34:44,110 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpbe2hoe5y/test.c -o /tmp/tmpbe2hoe5y/test.o +[OpenTinker] 2025-12-02 17:34:44,127 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp0gl4wseb/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp0gl4wseb/a.out +[OpenTinker] 2025-12-02 17:34:44,144 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpbe2hoe5y/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpbe2hoe5y/a.out +[OpenTinker] 2025-12-02 17:34:44,259 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpj4z5ww_7/test.c -o /tmp/tmpj4z5ww_7/test.o +[OpenTinker] 2025-12-02 17:34:44,282 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpm82vly95/test.c -o /tmp/tmpm82vly95/test.o +[OpenTinker] 2025-12-02 17:34:44,306 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpj4z5ww_7/test.o -laio -o /tmp/tmpj4z5ww_7/a.out +[OpenTinker] 2025-12-02 17:34:44,323 - root - INFO - gcc 
-pthread -B /root/miniconda3/compiler_compat /tmp/tmpm82vly95/test.o -laio -o /tmp/tmpm82vly95/a.out +[OpenTinker] 2025-12-02 17:34:44,750 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmp0uudj5q4/test.c -o /tmp/tmp0uudj5q4/test.o +[OpenTinker] 2025-12-02 17:34:44,775 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmp0uudj5q4/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmp0uudj5q4/a.out +[OpenTinker] 2025-12-02 17:34:44,792 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /root/miniconda3/include -fPIC -O2 -isystem /root/miniconda3/include -fPIC -c /tmp/tmpojuli1hs/test.c -o /tmp/tmpojuli1hs/test.o +[OpenTinker] 2025-12-02 17:34:44,817 - root - INFO - gcc -pthread -B /root/miniconda3/compiler_compat /tmp/tmpojuli1hs/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpojuli1hs/a.out +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0> +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO cudaDriverVersion 12090 +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO cudaDriverVersion 12090 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0> +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO cudaDriverVersion 12090 +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm 
config Blocking set to 1 +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0> +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO cudaDriverVersion 12090 +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Bootstrap: Using eth0:10.146.225.173<0> +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO NCCL version 2.27.3+cuda12.9 +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0> +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Initialized NET plugin Socket +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0> +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Initialized NET plugin Socket +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. 
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0> +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Initialized NET plugin Socket +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NET/Plugin: Could not find: libnccl-net-none.so. +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1. +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0 +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NET/Socket : Using [0]eth0:10.146.225.173<0> +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Initialized NET plugin Socket +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO ncclCommInitRankConfig comm 0x1ce20510 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 commId 0x5b31249cba627096 - Init START +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO ncclCommInitRankConfig comm 0x1f12a6e0 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 commId 0x5b31249cba627096 - Init START +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO ncclCommInitRankConfig comm 0x1ed808d0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 commId 0x5b31249cba627096 - Init START +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO ncclCommInitRankConfig comm 0x1dced410 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 commId 0x5b31249cba627096 - Init START +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO 
RAS client listening socket at ::1<28028> +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO RAS client listening socket at ::1<28028> +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Bootstrap timings total 0.001767 (create 0.000018, send 0.000086, recv 0.000183, ring 0.000072, delay 0.000000) +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Bootstrap timings total 0.001716 (create 0.000022, send 0.000094, recv 0.000175, ring 0.001058, delay 0.000000) +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Bootstrap timings total 0.026047 (create 0.000019, send 0.000085, recv 0.025513, ring 0.000075, delay 0.000001) +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Bootstrap timings total 0.002694 (create 0.000020, send 0.000107, recv 0.000116, ring 0.001088, delay 0.000001) +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NCCL_NVLS_ENABLE set by environment to 0. 
+lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO comm 0x1f12a6e0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO comm 0x1ce20510 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO comm 0x1dced410 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO comm 0x1ed808d0 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4. +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 
3/-1/-1->2->1 +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 19/24 : 0 1 2 3 
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-pjul-8:1217539:1218641 [2] NCCL INFO [Proxy Service] Device 2 CPU core 162 +lshn-qs-pjul-8:1217539:1218642 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 175 +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-pjul-8:1217540:1218643 [3] NCCL INFO [Proxy Service] Device 3 CPU core 167 +lshn-qs-pjul-8:1217540:1218644 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 81 +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. +lshn-qs-pjul-8:1217538:1218645 [1] NCCL INFO [Proxy Service] Device 1 CPU core 68 +lshn-qs-pjul-8:1217538:1218646 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 165 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. 
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-pjul-8:1217537:1218647 [0] NCCL INFO [Proxy Service] Device 0 CPU core 50 +lshn-qs-pjul-8:1217537:1218648 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 151 +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO ncclCommInitRankConfig comm 0x1f12a6e0 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 commId 0x5b31249cba627096 - Init COMPLETE +lshn-qs-pjul-8:1217540:1218636 [3] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 4 total 0.97 (kernels 0.21, alloc 0.58, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.06, rest 0.06) +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO ncclCommInitRankConfig comm 0x1dced410 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 commId 0x5b31249cba627096 - Init COMPLETE +lshn-qs-pjul-8:1217539:1218634 [2] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 4 total 0.97 (kernels 0.21, alloc 0.58, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.07, rest 0.06) +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO ncclCommInitRankConfig comm 0x1ed808d0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 commId 0x5b31249cba627096 - Init COMPLETE +lshn-qs-pjul-8:1217538:1218635 [1] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 4 total 0.97 (kernels 0.22, alloc 0.58, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.01, connections 0.07, rest 0.06) +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. 
+lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO ncclCommInitRankConfig comm 0x1ce20510 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 commId 0x5b31249cba627096 - Init COMPLETE +lshn-qs-pjul-8:1217537:1218633 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 4 total 1.00 (kernels 0.21, alloc 0.58, bootstrap 0.03, allgathers 0.01, topo 0.02, graphs 0.02, connections 0.06, rest 0.06) +[OpenTinker] 2025-12-02 17:34:48,206 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-02 17:34:48,207 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-02 17:34:48,293 - root - INFO - Training model with GRPO +[OpenTinker] 2025-12-02 17:34:48,293 - root - INFO - Training model with GRPO +INFO 12-02 17:34:48 [utils.py:328] non-default args: {'seed': 0, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-02 17:34:48 [utils.py:328] non-default args: {'seed': 1, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-02 17:34:48 [utils.py:328] non-default args: {'seed': 3, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-02 17:34:48 [utils.py:328] non-default args: {'seed': 2, 'max_model_len': 16896, 'distributed_executor_backend': 'external_launcher', 'gpu_memory_utilization': 0.4, 
'max_num_batched_tokens': 4096, 'max_num_seqs': 32, 'logprobs_mode': 'processed_logprobs', 'disable_log_stats': True, 'model_impl': 'vllm', 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B'} +INFO 12-02 17:35:06 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 17:35:06 [__init__.py:1815] Using max model len 16896 +INFO 12-02 17:35:06 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 17:35:06 [__init__.py:1815] Using max model len 16896 +INFO 12-02 17:35:06 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 17:35:06 [__init__.py:1815] Using max model len 16896 +INFO 12-02 17:35:06 [__init__.py:742] Resolved architecture: Qwen2ForCausalLM +INFO 12-02 17:35:06 [__init__.py:1815] Using max model len 16896 +INFO 12-02 17:35:07 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 17:35:07 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 17:35:07 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 17:35:07 [parallel.py:348] Disabling V1 multiprocessing for external launcher. +INFO 12-02 17:35:08 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-02 17:35:08 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-02 17:35:08 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. +INFO 12-02 17:35:08 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096. 
+INFO 12-02 17:35:09 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=3, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 12-02 17:35:09 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, 
tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=2, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 12-02 17:35:09 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, 
disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=1, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +INFO 12-02 17:35:09 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16896, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, 
reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":1,"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":64,"local_cache_dir":null} +[rank3]:[W1202 17:35:11.819826985 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1 +[rank1]:[W1202 17:35:11.853803987 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1 +[rank0]:[W1202 17:35:11.898406648 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator()) +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1 +[rank2]:[W1202 17:35:11.026588609 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. 
(function operator()) +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO ncclCommSplit comm 0x1fe18920 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 1 color 2003953581 key 1- Init START +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO ncclCommSplit comm 0x201b9280 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 1 color 2003953581 key 3- Init START +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO ncclCommSplit comm 0x1ed7b660 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 1 color 2003953581 key 2- Init START +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO ncclCommSplit comm 0x1e551b60 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 1 color 2003953581 key 0- Init START +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO Setting affinity for 
GPU 5 to 48-95,144-191 +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO comm 0x201b9280 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO comm 0x1ed7b660 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO comm 0x1e551b60 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO comm 0x1fe18920 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] -1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 
00/24 : 0 1 2 3 +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 08/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 20/24 : 0 1 2 3 
+lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217540:1218788 [3] NCCL INFO [Proxy Service] Device 3 CPU core 72 +lshn-qs-pjul-8:1217540:1218789 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 76 +lshn-qs-pjul-8:1217539:1218790 [2] NCCL INFO [Proxy Service] Device 2 CPU core 80 +lshn-qs-pjul-8:1217539:1218791 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 181 +lshn-qs-pjul-8:1217538:1218792 [1] NCCL INFO [Proxy Service] Device 1 CPU core 87 +lshn-qs-pjul-8:1217538:1218793 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 189 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-pjul-8:1217537:1218794 [0] NCCL INFO [Proxy Service] Device 0 CPU core 190 +lshn-qs-pjul-8:1217537:1218795 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 155 +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 
nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO ncclCommSplit comm 0x201b9280 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 1 color 2003953581 key 3 - Init COMPLETE +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO ncclCommSplit comm 0x1fe18920 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 1 color 2003953581 key 1 - Init COMPLETE +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO ncclCommSplit comm 0x1ed7b660 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 1 color 2003953581 key 2 - Init COMPLETE +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO ncclCommSplit comm 0x1e551b60 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 1 color 2003953581 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217540:1218778 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.29 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.02, rest 0.22) +lshn-qs-pjul-8:1217538:1218781 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.26 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.02, rest 0.18) +lshn-qs-pjul-8:1217539:1218787 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 
0.01, connections 0.02, rest 0.01) +lshn-qs-pjul-8:1217537:1218784 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.21 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.02, connections 0.02, rest 0.14) +[Gloo] Rank 0 is connected to 3[Gloo] Rank [Gloo] Rank peer ranks. Expected number of connected peer ranks is : 1 is connected to [Gloo] Rank 32 is connected to 33 peer ranks. is connected to 3 peer ranks. +Expected number of connected peer ranks is : 33 peer ranks. Expected number of connected peer ranks is : 3 +Expected number of connected peer ranks is : 3 + +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO ncclCommSplit comm 0x1e665b00 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 2 color 59908776 key 0- Init START +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL 
INFO comm 0x1e665b00 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 29/64 : 0 
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 60/64 : 0 
+lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217537:1218818 [0] NCCL INFO [Proxy Service] Device 0 CPU core 67 +lshn-qs-pjul-8:1217537:1218819 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 68 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217537:1218814 [0] NCCL INFO ncclCommSplit comm 0x1e665b00 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 2 color 59908776 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217537:1218814 
[0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO ncclCommSplit comm 0x1ff2b010 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 4 color 440515407 key 0- Init START +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO comm 0x1ff2b010 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 01/64 : 0 
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 32/64 : 0 
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Channel 63/64 : 0 
+lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217538:1218833 [1] NCCL INFO [Proxy Service] Device 1 CPU core 63 +lshn-qs-pjul-8:1217538:1218834 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 73 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO ncclCommSplit comm 0x1ff2b010 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 4 color 440515407 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217538:1218829 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] 
Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO ncclCommSplit comm 0x1ee8fe50 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 6 color 1227022723 key 0- Init START +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO comm 0x1ee8fe50 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 04/64 : 0 
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 35/64 : 0 
+lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] 
-1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217539:1218848 [2] NCCL INFO [Proxy Service] Device 2 CPU core 153 +lshn-qs-pjul-8:1217539:1218849 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 156 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO ncclCommSplit comm 0x1ee8fe50 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 6 color 1227022723 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217539:1218844 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO ncclCommSplit comm 0x202cdd80 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 8 color 1301067556 key 0- Init START +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO comm 0x202cdd80 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 05/64 : 0 
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 36/64 : 0 
+lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217540:1218865 [3] NCCL INFO [Proxy Service] Device 3 CPU core 172 +lshn-qs-pjul-8:1217540:1218866 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 181 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO ncclCommSplit comm 0x202cdd80 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 8 color 1301067556 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217540:1218859 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO ncclCommSplit comm 0x1fdc0650 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 9 color 59908776 key 0- Init START +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO comm 0x1fdc0650 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] 
NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO 
Channel 43/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] 
-1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217537:1218874 [0] NCCL INFO [Proxy Service] Device 0 CPU core 147 +lshn-qs-pjul-8:1217537:1218875 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 52 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO ncclCommSplit comm 0x1fdc0650 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 9 color 59908776 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217537:1218864 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.09 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.06) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO ncclCommSplit comm 0x216803c0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 11 color 440515407 key 0- Init START +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO comm 0x216803c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217538:1218889 [1] NCCL INFO [Proxy Service] Device 1 CPU core 65 +lshn-qs-pjul-8:1217538:1218890 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 62 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO ncclCommSplit comm 0x216803c0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 11 color 440515407 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217538:1218884 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO ncclCommSplit comm 0x205ea2f0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 13 color 1227022723 key 0- Init START +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO comm 0x205ea2f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217539:1218904 [2] NCCL INFO [Proxy Service] Device 2 CPU core 188 +lshn-qs-pjul-8:1217539:1218905 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 94 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO ncclCommSplit comm 0x205ea2f0 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 13 color 1227022723 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217539:1218900 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO ncclCommSplit comm 0x21a257b0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 15 color 1301067556 key 0- Init START +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO comm 0x21a257b0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 05/64 : 0 
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 36/64 : 0 
+lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217540:1218921 [3] NCCL INFO [Proxy Service] Device 3 CPU core 67 +lshn-qs-pjul-8:1217540:1218922 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 164 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO ncclCommSplit comm 0x21a257b0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 15 color 1301067556 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217540:1218915 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO ncclCommSplit comm 0x1fec8260 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 16 color 59908776 key 0- Init START +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO comm 0x1fec8260 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] 
NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO 
Channel 43/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] 
-1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217537:1218930 [0] NCCL INFO [Proxy Service] Device 0 CPU core 150 +lshn-qs-pjul-8:1217537:1218931 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 171 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO ncclCommSplit comm 0x1fec8260 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 16 color 59908776 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217537:1218920 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.04) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO ncclCommSplit comm 0x21787fd0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 18 color 440515407 key 0- Init START +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO comm 0x21787fd0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217538:1218945 [1] NCCL INFO [Proxy Service] Device 1 CPU core 60 +lshn-qs-pjul-8:1217538:1218946 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 169 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO ncclCommSplit comm 0x21787fd0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 18 color 440515407 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217538:1218941 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.06, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO ncclCommSplit comm 0x206f1f00 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 20 color 1227022723 key 0- Init START +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO comm 0x206f1f00 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217539:1218960 [2] NCCL INFO [Proxy Service] Device 2 CPU core 151 +lshn-qs-pjul-8:1217539:1218961 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 63 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO ncclCommSplit comm 0x206f1f00 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 20 color 1227022723 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217539:1218956 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO ncclCommSplit comm 0x21b2d3c0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 22 color 1301067556 key 0- Init START +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO comm 0x21b2d3c0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 05/64 : 0 
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 36/64 : 0 
+lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217540:1218977 [3] NCCL INFO [Proxy Service] Device 3 CPU core 181 +lshn-qs-pjul-8:1217540:1218978 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 189 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO ncclCommSplit comm 0x21b2d3c0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 22 color 1301067556 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217540:1218971 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.07 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.05, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO ncclCommSplit comm 0x1ffcfe70 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 23 color 59908776 key 0- Init START +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO comm 0x1ffcfe70 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] 
NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO 
Channel 43/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] 
-1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217537:1218986 [0] NCCL INFO [Proxy Service] Device 0 CPU core 53 +lshn-qs-pjul-8:1217537:1218987 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 62 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO ncclCommSplit comm 0x1ffcfe70 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 23 color 59908776 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217537:1218976 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.04, rest 0.08) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO ncclCommSplit comm 0x2188fbe0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 25 color 440515407 key 0- Init START +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO comm 0x2188fbe0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217538:1219001 [1] NCCL INFO [Proxy Service] Device 1 CPU core 184 +lshn-qs-pjul-8:1217538:1219002 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 48 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO ncclCommSplit comm 0x2188fbe0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 25 color 440515407 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217538:1218997 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.06 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.01, graphs 0.00, connections 0.03, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO ncclCommSplit comm 0x207f9b10 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 27 color 1227022723 key 0- Init START +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO comm 0x207f9b10 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217539:1219016 [2] NCCL INFO [Proxy Service] Device 2 CPU core 157 +lshn-qs-pjul-8:1217539:1219017 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 67 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO ncclCommSplit comm 0x207f9b10 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 27 color 1227022723 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217539:1219012 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.10 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.03, graphs 0.00, connections 0.07, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO ncclCommSplit comm 0x21c34fd0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 29 color 1301067556 key 0- Init START +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO comm 0x21c34fd0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 05/64 : 0 
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 36/64 : 0 
+lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217540:1219033 [3] NCCL INFO [Proxy Service] Device 3 CPU core 58 +lshn-qs-pjul-8:1217540:1219034 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 74 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO ncclCommSplit comm 0x21c34fd0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 29 color 1301067556 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217540:1219027 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.04 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.03, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO ncclCommSplit comm 0x200d7a80 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 30 color 59908776 key 0- Init START +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO comm 0x200d7a80 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] 
NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO 
Channel 43/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] 
-1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217537:1219042 [0] NCCL INFO [Proxy Service] Device 0 CPU core 68 +lshn-qs-pjul-8:1217537:1219043 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 177 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO ncclCommSplit comm 0x200d7a80 rank 0 nranks 1 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 30 color 59908776 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217537:1219032 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.17 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.02, topo 0.02, graphs 0.00, connections 0.08, rest 0.05) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO ncclCommSplit comm 0x219977f0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 32 color 440515407 key 0- Init START +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO comm 0x219977f0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217538:1219058 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 55 +lshn-qs-pjul-8:1217538:1219057 [1] NCCL INFO [Proxy Service] Device 1 CPU core 89 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO ncclCommSplit comm 0x219977f0 rank 0 nranks 1 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 32 color 440515407 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217538:1219053 [1] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.05 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.02, graphs 0.00, connections 0.02, rest 0.01) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Rank 3 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO ncclCommSplit comm 0x20901720 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 34 color 1227022723 key 0- Init START +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO comm 0x20901720 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 04/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO 
Channel 05/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 35/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 36/64 : 
0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] 
-1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217539:1219072 [2] NCCL INFO [Proxy Service] Device 2 CPU core 147 +lshn-qs-pjul-8:1217539:1219073 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 85 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO ncclCommSplit comm 0x20901720 rank 0 nranks 1 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 34 color 1227022723 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217539:1219068 [2] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Rank 2 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Rank 1 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Rank 0 has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Using network Socket +INFO 12-02 17:35:12 [parallel_state.py:1165] rank 1 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-02 17:35:12 [parallel_state.py:1165] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-02 17:35:12 [parallel_state.py:1165] rank 2 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO ncclCommSplit comm 0x21d3cbe0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 36 color 1301067556 key 0- Init START +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO comm 0x21d3cbe0 rank 0 nRanks 1 nNodes 1 localRanks 1 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 00/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 01/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 02/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 03/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 04/64 : 0 
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 05/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 06/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 07/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 08/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 09/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 10/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 11/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 12/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 13/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 14/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 15/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 16/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 17/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 18/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 19/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 20/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 21/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 22/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 23/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 24/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 25/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 26/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 27/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 28/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 29/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 30/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 31/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 32/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 33/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 34/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 35/64 : 0 
+lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 36/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 37/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 38/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 39/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 40/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 41/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 42/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 43/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 44/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 45/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 46/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 47/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 48/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 49/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 50/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 51/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 52/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 53/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 54/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 55/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 56/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 57/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 58/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 59/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 60/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 61/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 62/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Channel 63/64 : 0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] 
-1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1 [32] -1/-1/-1->0->-1 [33] -1/-1/-1->0->-1 [34] -1/-1/-1->0->-1 [35] -1/-1/-1->0->-1 [36] -1/-1/-1->0->-1 [37] -1/-1/-1->0->-1 [38] -1/-1/-1->0->-1 [39] -1/-1/-1->0->-1 [40] -1/-1/-1->0->-1 [41] -1/-1/-1->0->-1 [42] -1/-1/-1->0->-1 [43] -1/-1/-1->0->-1 [44] -1/-1/-1->0->-1 [45] -1/-1/-1->0->-1 [46] -1/-1/-1->0 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Check P2P Type isAllDirectP2p 0 directMode 0 +lshn-qs-pjul-8:1217540:1219084 [3] NCCL INFO [Proxy Service] Device 3 CPU core 83 +lshn-qs-pjul-8:1217540:1219085 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 189 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO 64 coll channels, 64 collnet channels, 0 nvls channels, 64 p2p channels, 64 p2p channels per peer +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO ncclCommSplit comm 0x21d3cbe0 rank 0 nranks 1 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 36 color 1301067556 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217540:1219083 [3] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 1 total 0.03 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.00, topo 0.01, graphs 0.00, connections 0.02, rest 0.00) +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +INFO 12-02 17:35:12 [parallel_state.py:1165] rank 3 in world size 4 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 +INFO 12-02 17:35:13 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-02 17:35:13 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-02 17:35:13 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-02 17:35:13 [gpu_model_runner.py:2338] Starting to load model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B... +INFO 12-02 17:35:13 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 17:35:13 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 17:35:13 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 17:35:13 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 17:35:13 [gpu_model_runner.py:2370] Loading model from scratch... +INFO 12-02 17:35:13 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 17:35:13 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 17:35:13 [cuda.py:362] Using Flash Attention backend on V1 engine. +INFO 12-02 17:35:14 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 17:35:14 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 17:35:14 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 17:35:14 [weight_utils.py:348] Using model weights format ['*.safetensors'] +INFO 12-02 17:35:14 [weight_utils.py:406] No model.safetensors.index.json found in remote. +INFO 12-02 17:35:15 [weight_utils.py:406] No model.safetensors.index.json found in remote. 
+ + Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 01/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 02/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 03/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 04/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 05/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 06/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 07/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 08/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 09/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 10/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 11/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 12/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 13/0 : 0[4] -> 1[5] via P2P/CUMEM +INFO 12-02 17:35:32 [llm.py:295] Supported_tasks: ('generate',) +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 14/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 15/0 : 0[4] -> 1[5] via P2P/CUMEM +INFO 12-02 17:35:32 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 16/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 17/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 18/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 19/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 20/0 : 0[4] -> 1[5] via P2P/CUMEM 
+lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 21/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 22/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Channel 23/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 00/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 01/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 02/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 03/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 04/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 05/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 06/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 07/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 08/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 09/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 10/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 11/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 12/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 13/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 14/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 15/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 16/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 17/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 18/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 19/0 : 
1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 20/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 21/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 22/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Channel 23/0 : 1[5] -> 2[6] via P2P/CUMEM +INFO 12-02 17:35:32 [llm.py:295] Supported_tasks: ('generate',) +INFO 12-02 17:35:32 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 00/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 01/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 02/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 03/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 04/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 05/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 06/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 07/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 08/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 09/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 10/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 11/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 12/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 13/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 14/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 15/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 16/0 : 
2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 17/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 18/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 19/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 20/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 21/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 22/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Channel 23/0 : 2[6] -> 3[7] via P2P/CUMEM +INFO 12-02 17:35:32 [llm.py:295] Supported_tasks: ('generate',) +INFO 12-02 17:35:32 [__init__.py:36] No IOProcessor plugins requested by the model +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 00/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 01/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 02/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 03/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 04/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 05/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 06/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 07/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 08/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 09/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 10/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 11/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 12/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 13/0 : 
3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 14/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 15/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 16/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 17/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 18/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 19/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 20/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 21/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 22/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Channel 23/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219210 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1217540:1219211 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1217538:1219209 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1217537:1219208 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. 
The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 151646, 'pad_token_id': 151643}. +[OpenTinker] 2025-12-02 17:35:33,615 - accelerate.accelerator - WARNING - Gradient accumulation steps mismatch: GradientAccumulationPlugin has 1, DeepSpeed config has 8. Using DeepSpeed's value. +lshn-qs-pjul-8:1217540:1217540 [3] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217538:1217538 [1] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217537:1217537 [0] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217539:1217539 [2] NCCL INFO Comm config Blocking set to 1 +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO Assigned NET plugin Socket to comm +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO Using network Socket +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO ncclCommSplit comm 0x1abf0240 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 37 color 2003953581 key 2- Init START +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO ncclCommSplit comm 0x17706840 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 37 color 2003953581 key 3- Init START +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO ncclCommSplit comm 0x1c0253c0 
rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 37 color 2003953581 key 1- Init START +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO ncclCommSplit comm 0x1a380640 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 37 color 2003953581 key 0- Init START +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO MNNVL busId 0x1c7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO MNNVL busId 0x17f000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO MNNVL busId 0x109000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO MNNVL busId 0x1a3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO Setting affinity for GPU 7 to 48-95,144-191 +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO Setting affinity for GPU 6 to 48-95,144-191 +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO Setting affinity for GPU 5 to 48-95,144-191 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Setting affinity for GPU 4 to 48-95,144-191 +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO comm 0x17706840 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO comm 0x1a380640 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0 +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO comm 0x1abf0240 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0 +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO comm 0x1c0253c0 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0 +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2 [12] -1/-1/-1->3->2 [13] -1/-1/-1->3->2 [14] -1/-1/-1->3->2 [15] -1/-1/-1->3->2 [16] 
-1/-1/-1->3->2 [17] -1/-1/-1->3->2 [18] -1/-1/-1->3->2 [19] -1/-1/-1->3->2 [20] -1/-1/-1->3->2 [21] -1/-1/-1->3->2 [22] -1/-1/-1->3->2 [23] -1/-1/-1->3->2 +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 00/24 : 0 1 2 3 +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 01/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 02/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 03/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 04/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 05/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 06/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 07/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 08/24 : 0 1 
2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 09/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 10/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 11/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 12/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 13/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 14/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 15/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 16/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 17/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 18/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 19/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 20/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 21/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 22/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Channel 23/24 : 0 1 2 3 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO P2P Chunksize set to 524288 +lshn-qs-pjul-8:1217540:1219226 [3] NCCL INFO [Proxy Service] Device 3 CPU core 61 +lshn-qs-pjul-8:1217540:1219227 [3] NCCL INFO [Proxy Service UDS] Device 3 CPU core 68 +lshn-qs-pjul-8:1217538:1219228 [1] NCCL INFO [Proxy Service] Device 1 CPU core 151 +lshn-qs-pjul-8:1217538:1219229 [1] NCCL INFO [Proxy Service UDS] Device 1 CPU core 82 
+lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Check P2P Type isAllDirectP2p 1 directMode 0 +lshn-qs-pjul-8:1217537:1219230 [0] NCCL INFO [Proxy Service] Device 0 CPU core 179 +lshn-qs-pjul-8:1217537:1219231 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 84 +lshn-qs-pjul-8:1217539:1219232 [2] NCCL INFO [Proxy Service] Device 2 CPU core 86 +lshn-qs-pjul-8:1217539:1219233 [2] NCCL INFO [Proxy Service UDS] Device 2 CPU core 87 +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512 +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO CC Off, workFifoBytes 1048576 +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO ncclCommSplit comm 0x1abf0240 rank 2 nranks 4 cudaDev 2 nvmlDev 6 busId 1a3000 parent 0x1dced410 splitCount 37 color 2003953581 key 2 - Init COMPLETE +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO ncclCommSplit comm 0x1a380640 rank 0 nranks 4 cudaDev 0 nvmlDev 4 busId 109000 parent 0x1ce20510 splitCount 37 color 2003953581 key 0 - Init COMPLETE +lshn-qs-pjul-8:1217539:1219225 [2] NCCL INFO Init timings - ncclCommSplit: rank 2 nranks 4 total 0.08 (kernels 0.00, alloc 0.00, bootstrap 0.00, 
allgathers 0.01, topo 0.03, graphs 0.01, connections 0.03, rest 0.01) +lshn-qs-pjul-8:1217537:1219222 [0] NCCL INFO Init timings - ncclCommSplit: rank 0 nranks 4 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.03, rest 0.06) +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO ncclCommSplit comm 0x17706840 rank 3 nranks 4 cudaDev 3 nvmlDev 7 busId 1c7000 parent 0x1f12a6e0 splitCount 37 color 2003953581 key 3 - Init COMPLETE +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO ncclCommSplit comm 0x1c0253c0 rank 1 nranks 4 cudaDev 1 nvmlDev 5 busId 17f000 parent 0x1ed808d0 splitCount 37 color 2003953581 key 1 - Init COMPLETE +lshn-qs-pjul-8:1217540:1219216 [3] NCCL INFO Init timings - ncclCommSplit: rank 3 nranks 4 total 0.17 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.03, rest 0.10) +lshn-qs-pjul-8:1217538:1219219 [1] NCCL INFO Init timings - ncclCommSplit: rank 1 nranks 4 total 0.13 (kernels 0.00, alloc 0.00, bootstrap 0.00, allgathers 0.01, topo 0.03, graphs 0.01, connections 0.02, rest 0.06) +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 00/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 00/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 01/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 00/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 01/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 00/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 02/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 01/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 02/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 01/0 : 3[7] -> 0[4] via P2P/CUMEM 
+lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 03/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 03/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 02/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 02/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 04/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 04/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 03/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 05/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 03/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 05/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 04/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 04/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 06/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 06/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 05/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 07/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 07/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 05/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 06/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 08/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 08/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 06/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 07/0 : 
3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 09/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 09/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 07/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 08/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 10/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 10/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 08/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 09/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 11/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 11/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 09/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 10/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 12/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 12/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 10/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 11/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 13/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 13/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 11/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 12/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 14/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 14/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] 
NCCL INFO Channel 12/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 13/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 15/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 15/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 13/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 14/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 16/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 16/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 14/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 15/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 17/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 17/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 15/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 16/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 18/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 18/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 16/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 17/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 19/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 19/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 17/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 18/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 20/0 : 1[5] -> 2[6] via P2P/CUMEM 
+lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 20/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 18/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 19/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 21/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 21/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 19/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 20/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 22/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 22/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 20/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 21/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Channel 23/0 : 1[5] -> 2[6] via P2P/CUMEM +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Channel 23/0 : 0[4] -> 1[5] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 21/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 22/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 22/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Channel 23/0 : 3[7] -> 0[4] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Channel 23/0 : 2[6] -> 3[7] via P2P/CUMEM +lshn-qs-pjul-8:1217539:1219234 [2] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1217538:1219236 [1] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1217537:1219235 [0] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +lshn-qs-pjul-8:1217540:1219237 [3] NCCL INFO Connected all rings, use ring PXN 0 GDR 1 +INFO 12-02 17:35:36 [block_pool.py:292] Successfully reset 
prefix cache +INFO 12-02 17:35:36 [block_pool.py:292] Successfully reset prefix cache +INFO 12-02 17:35:36 [block_pool.py:292] Successfully reset prefix cache +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. + + 0%| | 0/1024 [00:00