{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11428571428571428, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2700.4271850585938, "cov_mean": -6.0587970438064076e-05, "cov_std": 0.35307812318205833, "entropy": 0.36474609375, "epoch": 0.001142857142857143, "grad_norm": 0.45541471242904663, "kl": 0.0, "learning_rate": 1e-07, "loss": -0.0382, "reward": 0.7604166893288493, "reward_std": 0.4268697127699852, "rewards/accuracy_reward": 0.25000001303851604, "rewards/format_reward": 0.5104166669771075, "step": 1, "w_high_ratio": 0.2200421690940857, "w_low_ratio": 0.03663695091381669, "w_max": 2.1593789756298065, "w_mean": 1.4711343348026276, "w_min": 6.525355682266089e-35, "w_std": 0.2659660503268242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3127.3958435058594, "cov_mean": -2.155053698515985e-05, "cov_std": 0.310540571808815, "entropy": 0.353515625, "epoch": 0.002285714285714286, "grad_norm": 0.5143813490867615, "kl": 0.0, "learning_rate": 2e-07, "loss": 0.0049, "reward": 0.6458333637565374, "reward_std": 0.4249730706214905, "rewards/accuracy_reward": 0.2812500102445483, "rewards/format_reward": 0.3645833386108279, "step": 2, "w_high_ratio": 0.05183619633316994, "w_low_ratio": 0.036958135198801756, "w_max": 1.8325217366218567, "w_mean": 1.2113382518291473, "w_min": 0.0, "w_std": 0.20957503467798233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3791.375, "cov_mean": -3.563215068425052e-05, "cov_std": 0.28256653994321823, "entropy": 0.4658203125, "epoch": 0.0034285714285714284, "grad_norm": 0.2701888084411621, "kl": 4.756450653076172e-05, "learning_rate": 3e-07, "loss": 0.0344, "reward": 0.16666667349636555, "reward_std": 0.3025414012372494, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.14583333395421505, "step": 3, "w_high_ratio": 0.0, "w_low_ratio": 0.03658500872552395, "w_max": 1.348844289779663, "w_mean": 1.0439709424972534, "w_min": 0.0, "w_std": 0.15747325122356415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2446.1250610351562, "cov_mean": -2.657165350683499e-05, "cov_std": 0.47042107582092285, "entropy": 0.4052734375, "epoch": 0.004571428571428572, "grad_norm": 0.6580816507339478, "kl": 2.866983413696289e-05, "learning_rate": 4e-07, "loss": -0.0116, "reward": 0.8541666865348816, "reward_std": 0.5623367577791214, "rewards/accuracy_reward": 0.19791667070239782, "rewards/format_reward": 0.6562500074505806, "step": 4, "w_high_ratio": 0.2048901468515396, "w_low_ratio": 0.04687658231705427, "w_max": 2.3084834814071655, "w_mean": 1.5087227523326874, "w_min": 3.5522916070634113e-43, "w_std": 0.31092390790581703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3562.4166870117188, "cov_mean": 3.653238718470675e-05, "cov_std": 0.539387047290802, "entropy": 0.45458984375, "epoch": 0.005714285714285714, "grad_norm": 0.30956918001174927, "kl": 3.820657730102539e-05, "learning_rate": 5e-07, "loss": 0.0053, "reward": 0.4479166939854622, "reward_std": 0.5839087814092636, "rewards/accuracy_reward": 0.08333333861082792, "rewards/format_reward": 0.3645833507180214, "step": 5, "w_high_ratio": 0.009932879358530045, "w_low_ratio": 0.061708422377705574, "w_max": 1.4947779774665833, "w_mean": 1.13177028298378, "w_min": 0.0, "w_std": 0.2904536984860897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3346.166748046875, "cov_mean": 2.951182568722288e-05, "cov_std": 0.4695303291082382, "entropy": 0.474609375, "epoch": 0.006857142857142857, "grad_norm": 0.4116966426372528, "kl": 4.678964614868164e-05, "learning_rate": 6e-07, "loss": 0.0655, "reward": 0.40625001303851604, "reward_std": 0.5175340622663498, "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.31250001303851604, "step": 6, "w_high_ratio": 0.09942464530467987, "w_low_ratio": 0.05820021778345108, "w_max": 2.0522369146347046, "w_mean": 1.2698509693145752, "w_min": 6.311469302795941e-40, "w_std": 0.3068386148661375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3291.197998046875, "cov_mean": -2.080401827697642e-06, "cov_std": 0.5660274773836136, "entropy": 0.38671875, "epoch": 0.008, "grad_norm": 0.4001730680465698, "kl": 2.8431415557861328e-05, "learning_rate": 7e-07, "loss": -0.0874, "reward": 0.9687500298023224, "reward_std": 0.639276884496212, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.6875000149011612, "step": 7, "w_high_ratio": 0.04338983818888664, "w_low_ratio": 0.05278784967958927, "w_max": 1.6053467988967896, "w_mean": 1.2385202646255493, "w_min": 0.0, "w_std": 0.2744893953204155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2852.4896240234375, "cov_mean": -2.6475029699213337e-05, "cov_std": 0.24741052091121674, "entropy": 0.349365234375, "epoch": 0.009142857142857144, "grad_norm": 0.28081798553466797, "kl": 2.35140323638916e-05, "learning_rate": 8e-07, "loss": -0.0024, "reward": 0.8750000111758709, "reward_std": 0.3533418998122215, "rewards/accuracy_reward": 0.3854166679084301, "rewards/format_reward": 0.48958334885537624, "step": 8, "w_high_ratio": 0.0625, "w_low_ratio": 0.026329820044338703, "w_max": 1.744232177734375, "w_mean": 1.2852342873811722, "w_min": 0.25, "w_std": 0.13892405480146408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3371.2708740234375, "cov_mean": -2.0414277514646528e-05, "cov_std": 0.3537435829639435, "entropy": 0.4619140625, "epoch": 0.010285714285714285, "grad_norm": 0.3546924591064453, "kl": 3.835558891296387e-05, "learning_rate": 9e-07, "loss": -0.0341, "reward": 0.3958333432674408, "reward_std": 0.4515319801867008, "rewards/accuracy_reward": 0.09375000186264515, "rewards/format_reward": 0.3020833358168602, "step": 9, "w_high_ratio": 0.07049691677093506, "w_low_ratio": 0.03988973796367645, "w_max": 1.8283900916576385, "w_mean": 1.2170793116092682, "w_min": 9.553366044251431e-29, "w_std": 0.2274811826646328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2993.8438110351562, "cov_mean": -3.597586055548163e-05, "cov_std": 0.3790554851293564, "entropy": 0.349609375, "epoch": 0.011428571428571429, "grad_norm": 0.4014468193054199, "kl": 3.153085708618164e-05, "learning_rate": 1e-06, "loss": 0.111, "reward": 0.572916679084301, "reward_std": 0.5256113260984421, "rewards/accuracy_reward": 0.15625000651925802, "rewards/format_reward": 0.416666679084301, "step": 10, "w_high_ratio": 0.17509328201413155, "w_low_ratio": 0.04464914742857218, "w_max": 2.3221429884433746, "w_mean": 1.403680145740509, "w_min": 0.0, "w_std": 0.2820280008018017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3645.3126220703125, "cov_mean": 1.727970106912835e-05, "cov_std": 0.314908966422081, "entropy": 0.3701171875, "epoch": 0.012571428571428572, "grad_norm": 0.40044310688972473, "kl": 3.1620264053344727e-05, "learning_rate": 9.997258721585931e-07, "loss": 0.0585, "reward": 0.25000001303851604, "reward_std": 0.4806990921497345, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.15625000279396772, "step": 11, "w_high_ratio": 0.0, "w_low_ratio": 0.04341120272874832, "w_max": 1.4488586485385895, "w_mean": 1.097432792186737, "w_min": 4.6695499555262094e-38, "w_std": 0.2005491964519024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2548.0729370117188, "cov_mean": -8.678332051204052e-05, "cov_std": 0.3924334645271301, "entropy": 0.3896484375, "epoch": 0.013714285714285714, "grad_norm": 0.40460628271102905, "kl": 3.9458274841308594e-05, "learning_rate": 9.989038226169207e-07, "loss": 0.0329, "reward": 0.9479166939854622, "reward_std": 0.4162924438714981, "rewards/accuracy_reward": 0.23958334140479565, "rewards/format_reward": 0.708333358168602, "step": 12, "w_high_ratio": 0.14796987175941467, "w_low_ratio": 0.038647969253361225, "w_max": 2.0233654975891113, "w_mean": 1.479979693889618, "w_min": 0.0, "w_std": 0.2828930839896202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3180.9896850585938, "cov_mean": -3.604557650760398e-05, "cov_std": 0.29976917430758476, "entropy": 0.39111328125, "epoch": 0.014857142857142857, "grad_norm": 0.42143014073371887, "kl": 2.7954578399658203e-05, "learning_rate": 9.975348529157229e-07, "loss": 0.0007, "reward": 0.5937500298023224, "reward_std": 0.39751993864774704, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.4062500074505806, "step": 13, "w_high_ratio": 0.13092797622084618, "w_low_ratio": 0.038139537908136845, "w_max": 1.9087003767490387, "w_mean": 1.2740049362182617, "w_min": 0.0, "w_std": 0.20897787064313889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3071.760498046875, "cov_mean": -1.3722287803830113e-06, "cov_std": 0.3737764284014702, "entropy": 0.37744140625, "epoch": 0.016, "grad_norm": 0.5302906632423401, "kl": 2.9087066650390625e-05, "learning_rate": 9.956206309337066e-07, "loss": 0.0182, "reward": 0.541666679084301, "reward_std": 0.4254928454756737, "rewards/accuracy_reward": 0.15625000279396772, "rewards/format_reward": 0.3854166679084301, "step": 14, "w_high_ratio": 0.13122042268514633, "w_low_ratio": 0.04646214470267296, "w_max": 2.057934284210205, "w_mean": 1.2967519462108612, "w_min": 6.977258874336181e-23, "w_std": 0.279865525662899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2915.2396240234375, "cov_mean": -2.9782687306578737e-05, "cov_std": 0.3060881793498993, "entropy": 0.3681640625, "epoch": 0.017142857142857144, "grad_norm": 0.471722811460495, "kl": 3.0308961868286133e-05, "learning_rate": 9.931634888554935e-07, "loss": 0.0075, "reward": 0.6145833432674408, "reward_std": 0.3603988029062748, "rewards/accuracy_reward": 0.19791666977107525, "rewards/format_reward": 0.4166666679084301, "step": 15, "w_high_ratio": 0.0, "w_low_ratio": 0.03737350553274155, "w_max": 1.5459995865821838, "w_mean": 1.177234023809433, "w_min": 0.0, "w_std": 0.20952722802758217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3857.7084350585938, "cov_mean": -3.300000025774352e-05, "cov_std": 0.18836934491991997, "entropy": 0.45751953125, "epoch": 0.018285714285714287, "grad_norm": 0.31818071007728577, "kl": 3.904104232788086e-05, "learning_rate": 9.901664203302124e-07, "loss": 0.0556, "reward": 0.13541666977107525, "reward_std": 0.249445378780365, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.0833333358168602, "step": 16, "w_high_ratio": 0.0, "w_low_ratio": 0.02626894786953926, "w_max": 1.197378009557724, "w_mean": 1.0219765603542328, "w_min": 0.25, "w_std": 0.10555266216397285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2503.479232788086, "cov_mean": 5.234984382695984e-05, "cov_std": 0.34602249413728714, "entropy": 0.43408203125, "epoch": 0.019428571428571427, "grad_norm": 0.43525052070617676, "kl": 5.2034854888916016e-05, "learning_rate": 9.866330768241983e-07, "loss": 0.0179, "reward": 0.7604167014360428, "reward_std": 0.4241996556520462, "rewards/accuracy_reward": 0.1770833432674408, "rewards/format_reward": 0.5833333432674408, "step": 17, "w_high_ratio": 0.13831434771418571, "w_low_ratio": 0.039523204788565636, "w_max": 2.07596218585968, "w_mean": 1.3631863296031952, "w_min": 0.25, "w_std": 0.22783420607447624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3050.8438110351562, "cov_mean": -8.538075144315371e-05, "cov_std": 0.34513213485479355, "entropy": 0.36328125, "epoch": 0.02057142857142857, "grad_norm": 0.3079332113265991, "kl": 5.075335502624512e-05, "learning_rate": 9.825677631722435e-07, "loss": 0.005, "reward": 0.5833333432674408, "reward_std": 0.4453107975423336, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.4583333469927311, "step": 18, "w_high_ratio": 0.0583355538547039, "w_low_ratio": 0.038819507928565145, "w_max": 1.7474263310432434, "w_mean": 1.2030333578586578, "w_min": 1.0509738482436128e-45, "w_std": 0.20471886917948723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3198.7500610351562, "cov_mean": 6.975242376938695e-05, "cov_std": 0.4591265842318535, "entropy": 0.3994140625, "epoch": 0.021714285714285714, "grad_norm": 0.44368991255760193, "kl": 5.0961971282958984e-05, "learning_rate": 9.779754323328192e-07, "loss": -0.0137, "reward": 0.8541666865348816, "reward_std": 0.6421672403812408, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.479166679084301, "step": 19, "w_high_ratio": 0.07967927679419518, "w_low_ratio": 0.055658016353845596, "w_max": 1.7943021953105927, "w_mean": 1.2339626252651215, "w_min": 0.0, "w_std": 0.28908008337020874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2531.0208740234375, "cov_mean": 3.258255492255557e-05, "cov_std": 0.3959212973713875, "entropy": 0.3134765625, "epoch": 0.022857142857142857, "grad_norm": 0.42710381746292114, "kl": 7.474422454833984e-05, "learning_rate": 9.728616793536587e-07, "loss": -0.0142, "reward": 0.9479166865348816, "reward_std": 0.4626789018511772, "rewards/accuracy_reward": 0.260416679084301, "rewards/format_reward": 0.6875000149011612, "step": 20, "w_high_ratio": 0.20212292298674583, "w_low_ratio": 0.040643465239554644, "w_max": 2.211318254470825, "w_mean": 1.4521130919456482, "w_min": 0.0, "w_std": 0.25691715627908707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2890.541778564453, "cov_mean": 1.6164100088644773e-05, "cov_std": 0.2818361334502697, "entropy": 0.421875, "epoch": 0.024, "grad_norm": 0.8042517900466919, "kl": 0.0001793205738067627, "learning_rate": 9.672327345550543e-07, "loss": 0.073, "reward": 0.697916692122817, "reward_std": 0.3805258348584175, "rewards/accuracy_reward": 0.22916667070239782, "rewards/format_reward": 0.4687500102445483, "step": 21, "w_high_ratio": 0.1653403341770172, "w_low_ratio": 0.034426179714500904, "w_max": 2.0890542566776276, "w_mean": 1.4488303065299988, "w_min": 0.25, "w_std": 0.26479026675224304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2023.8750610351562, "cov_mean": -3.984599959494517e-05, "cov_std": 0.3324627988040447, "entropy": 0.41162109375, "epoch": 0.025142857142857144, "grad_norm": 0.43941012024879456, "kl": 0.0001862645149230957, "learning_rate": 9.610954559391704e-07, "loss": 0.0018, "reward": 1.0416666865348816, "reward_std": 0.32273583114147186, "rewards/accuracy_reward": 0.2708333386108279, "rewards/format_reward": 0.7708333730697632, "step": 22, "w_high_ratio": 0.16147570684552193, "w_low_ratio": 0.026547667337581515, "w_max": 2.171365201473236, "w_mean": 1.5571591556072235, "w_min": 2.1019476964872256e-45, "w_std": 0.16451371088624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2749.6146850585938, "cov_mean": -4.101629156139097e-05, "cov_std": 0.25967343151569366, "entropy": 0.37939453125, "epoch": 0.026285714285714287, "grad_norm": 0.37253376841545105, "kl": 0.00014030933380126953, "learning_rate": 9.54457320834625e-07, "loss": -0.0678, "reward": 0.5625000223517418, "reward_std": 0.3055335730314255, "rewards/accuracy_reward": 0.11458333861082792, "rewards/format_reward": 0.4479166679084301, "step": 23, "w_high_ratio": 0.14202075079083443, "w_low_ratio": 0.028185136150568724, "w_max": 2.1473127901554108, "w_mean": 1.35337632894516, "w_min": 1.3966908548442706e-38, "w_std": 0.21078352630138397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3073.7813110351562, "cov_mean": -2.623773912091565e-05, "cov_std": 0.5731624215841293, "entropy": 0.365234375, "epoch": 0.027428571428571427, "grad_norm": 0.6742011904716492, "kl": 0.00015079975128173828, "learning_rate": 9.473264167865171e-07, "loss": -0.0491, "reward": 0.7916666939854622, "reward_std": 0.7121171355247498, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.510416679084301, "step": 24, "w_high_ratio": 0.15864675119519234, "w_low_ratio": 0.057481554336845875, "w_max": 2.1011292338371277, "w_mean": 1.4009218215942383, "w_min": 0.0, "w_std": 0.3654456064105034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3008.9271850585938, "cov_mean": -5.06913784192875e-05, "cov_std": 0.360674187541008, "entropy": 0.43896484375, "epoch": 0.02857142857142857, "grad_norm": 0.6551496982574463, "kl": 0.00020998716354370117, "learning_rate": 9.397114317029974e-07, "loss": 0.0466, "reward": 0.5208333432674408, "reward_std": 0.4276355504989624, "rewards/accuracy_reward": 0.1354166679084301, "rewards/format_reward": 0.3854166865348816, "step": 25, "w_high_ratio": 0.10193426162004471, "w_low_ratio": 0.04761309362947941, "w_max": 2.144305258989334, "w_mean": 1.3412529230117798, "w_min": 0.25, "w_std": 0.2752615138888359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3288.4896240234375, "cov_mean": 1.0494537946215132e-05, "cov_std": 0.2694205194711685, "entropy": 0.423828125, "epoch": 0.029714285714285714, "grad_norm": 0.2771300673484802, "kl": 4.094839096069336e-05, "learning_rate": 9.316216432703916e-07, "loss": -0.0308, "reward": 0.6875000298023224, "reward_std": 0.3060605004429817, "rewards/accuracy_reward": 0.2395833358168602, "rewards/format_reward": 0.447916679084301, "step": 26, "w_high_ratio": 0.0, "w_low_ratio": 0.026628307532519102, "w_max": 1.6005243062973022, "w_mean": 1.1837812960147858, "w_min": 0.0, "w_std": 0.15812482312321663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3304.0625610351562, "cov_mean": 2.6395198347017867e-05, "cov_std": 0.47998297959566116, "entropy": 0.43408203125, "epoch": 0.030857142857142857, "grad_norm": 0.3476252257823944, "kl": 0.0001595616340637207, "learning_rate": 9.230669076497687e-07, "loss": -0.0198, "reward": 0.614583358168602, "reward_std": 0.5712436102330685, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.4479166716337204, "step": 27, "w_high_ratio": 0.12017197906970978, "w_low_ratio": 0.05561595968902111, "w_max": 1.8565902709960938, "w_mean": 1.2779352962970734, "w_min": 0.0, "w_std": 0.2663590759038925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3088.8854370117188, "cov_mean": -3.223641306249192e-05, "cov_std": 0.42785073816776276, "entropy": 0.4013671875, "epoch": 0.032, "grad_norm": 0.33925580978393555, "kl": 0.00015425682067871094, "learning_rate": 9.140576474687263e-07, "loss": 0.0176, "reward": 0.739583358168602, "reward_std": 0.5647517889738083, "rewards/accuracy_reward": 0.29166667722165585, "rewards/format_reward": 0.4479166865348816, "step": 28, "w_high_ratio": 0.049459055066108704, "w_low_ratio": 0.046367804519832134, "w_max": 1.811535805463791, "w_mean": 1.2396393418312073, "w_min": 0.0, "w_std": 0.264127716422081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3654.9375610351562, "cov_mean": -5.409401546785375e-05, "cov_std": 0.37247660756111145, "entropy": 0.4443359375, "epoch": 0.03314285714285714, "grad_norm": 0.3806890547275543, "kl": 0.00038933753967285156, "learning_rate": 9.046048391230247e-07, "loss": 0.0781, "reward": 0.28125, "reward_std": 0.4390515610575676, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.18750000558793545, "step": 29, "w_high_ratio": 0.0, "w_low_ratio": 0.04866650328040123, "w_max": 1.3806794583797455, "w_mean": 1.0922435522079468, "w_min": 0.25, "w_std": 0.21911596134305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3306.5938110351562, "cov_mean": -2.3586735551361926e-05, "cov_std": 0.4439524784684181, "entropy": 0.37841796875, "epoch": 0.03428571428571429, "grad_norm": 0.49297624826431274, "kl": 0.0008985996246337891, "learning_rate": 8.9471999940354e-07, "loss": -0.0256, "reward": 0.6458333544433117, "reward_std": 0.5611635595560074, "rewards/accuracy_reward": 0.2187500037252903, "rewards/format_reward": 0.4270833395421505, "step": 30, "w_high_ratio": 0.06216667778789997, "w_low_ratio": 0.05258181784301996, "w_max": 1.9480324983596802, "w_mean": 1.2739951610565186, "w_min": 4.3700652660890127e-35, "w_std": 0.28470994904637337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3249.072998046875, "cov_mean": 7.066663692967268e-05, "cov_std": 0.31172432936728, "entropy": 0.38232421875, "epoch": 0.03542857142857143, "grad_norm": 0.41590920090675354, "kl": 0.0011307001113891602, "learning_rate": 8.844151714648274e-07, "loss": -0.009, "reward": 0.541666679084301, "reward_std": 0.48244282230734825, "rewards/accuracy_reward": 0.19791667442768812, "rewards/format_reward": 0.34375000558793545, "step": 31, "w_high_ratio": 0.09930127486586571, "w_low_ratio": 0.04017635714262724, "w_max": 2.09058153629303, "w_mean": 1.2682196497917175, "w_min": 0.0, "w_std": 0.2415708377957344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3569.0626220703125, "cov_mean": -2.6002062440966256e-06, "cov_std": 0.3952501490712166, "entropy": 0.43212890625, "epoch": 0.036571428571428574, "grad_norm": 1.6052252054214478, "kl": 0.03867650032043457, "learning_rate": 8.737029101523929e-07, "loss": -0.0005, "reward": 0.6979166865348816, "reward_std": 0.5698855072259903, "rewards/accuracy_reward": 0.3333333395421505, "rewards/format_reward": 0.3645833432674408, "step": 32, "w_high_ratio": 0.0, "w_low_ratio": 0.04428828274831176, "w_max": 1.4899851083755493, "w_mean": 1.1257199943065643, "w_min": 0.0, "w_std": 0.2084966115653515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3602.0313110351562, "cov_mean": 3.6438897495827405e-05, "cov_std": 0.35359735041856766, "entropy": 0.36865234375, "epoch": 0.037714285714285714, "grad_norm": 0.4356963038444519, "kl": 0.006441354751586914, "learning_rate": 8.625962667065487e-07, "loss": 0.0063, "reward": 0.6041666977107525, "reward_std": 0.5775354653596878, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.35416667722165585, "step": 33, "w_high_ratio": 0.0, "w_low_ratio": 0.04679631860926747, "w_max": 1.4279894530773163, "w_mean": 1.114120066165924, "w_min": 1.083308810307908e-39, "w_std": 0.1986728459596634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2617.0834045410156, "cov_mean": 6.101907820266206e-05, "cov_std": 0.4060870446264744, "entropy": 0.42529296875, "epoch": 0.038857142857142854, "grad_norm": 0.6856685876846313, "kl": 0.0007038116455078125, "learning_rate": 8.511087728614862e-07, "loss": -0.0313, "reward": 0.9583333432674408, "reward_std": 0.4463158957660198, "rewards/accuracy_reward": 0.40625001303851604, "rewards/format_reward": 0.5520833535119891, "step": 34, "w_high_ratio": 0.24574057757854462, "w_low_ratio": 0.03856370970606804, "w_max": 2.4257175028324127, "w_mean": 1.564720779657364, "w_min": 9.308517412847608e-40, "w_std": 0.33169806748628616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3259.072998046875, "cov_mean": 6.144623966974905e-06, "cov_std": 0.4504075199365616, "entropy": 0.41455078125, "epoch": 0.04, "grad_norm": 0.41887158155441284, "kl": 0.0009140968322753906, "learning_rate": 8.392544243589427e-07, "loss": 0.0587, "reward": 0.5312500111758709, "reward_std": 0.6110180467367172, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.34375001303851604, "step": 35, "w_high_ratio": 0.0, "w_low_ratio": 0.055781897623091936, "w_max": 1.4624531269073486, "w_mean": 1.1014132499694824, "w_min": 5.8279178943564365e-36, "w_std": 0.2577386908233166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3602.3438110351562, "cov_mean": 2.4346391001017764e-05, "cov_std": 0.21211356669664383, "entropy": 0.49267578125, "epoch": 0.04114285714285714, "grad_norm": 0.5710666179656982, "kl": 0.0008481144905090332, "learning_rate": 8.270476638965461e-07, "loss": -0.0327, "reward": 0.19791667442768812, "reward_std": 0.16615793853998184, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.18750001024454832, "step": 36, "w_high_ratio": 0.04688615724444389, "w_low_ratio": 0.019336079712957144, "w_max": 1.5947359800338745, "w_mean": 1.155133068561554, "w_min": 0.25, "w_std": 0.13783840090036392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3618.604248046875, "cov_mean": 3.312654916953761e-05, "cov_std": 0.2396765574812889, "entropy": 0.41943359375, "epoch": 0.04228571428571429, "grad_norm": 0.35426977276802063, "kl": 0.0006122589111328125, "learning_rate": 8.145033635316128e-07, "loss": -0.0109, "reward": 0.19791667256504297, "reward_std": 0.22218847274780273, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.18750000558793545, "step": 37, "w_high_ratio": 0.004357387777417898, "w_low_ratio": 0.026776093989610672, "w_max": 1.527068942785263, "w_mean": 1.1089930832386017, "w_min": 0.0, "w_std": 0.156088937073946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3681.0625610351562, "cov_mean": -1.270252869289834e-05, "cov_std": 0.19421957433223724, "entropy": 0.44921875, "epoch": 0.04342857142857143, "grad_norm": 0.24127696454524994, "kl": 0.00042241811752319336, "learning_rate": 8.01636806561836e-07, "loss": 0.0018, "reward": 0.25, "reward_std": 0.287552148103714, "rewards/accuracy_reward": 0.11458333861082792, "rewards/format_reward": 0.13541666977107525, "step": 38, "w_high_ratio": 0.041594721376895905, "w_low_ratio": 0.02351229265332222, "w_max": 1.402332603931427, "w_mean": 1.097372442483902, "w_min": 0.5, "w_std": 0.1229349672794342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3038.5626220703125, "cov_mean": 2.8159271096228622e-05, "cov_std": 0.31503692269325256, "entropy": 0.364013671875, "epoch": 0.044571428571428574, "grad_norm": 0.2635081708431244, "kl": 0.001262664794921875, "learning_rate": 7.884636689049422e-07, "loss": -0.0133, "reward": 0.7291666865348816, "reward_std": 0.26209891587495804, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.5000000149011612, "step": 39, "w_high_ratio": 0.0, "w_low_ratio": 0.027915622107684612, "w_max": 1.4430480003356934, "w_mean": 1.1478222012519836, "w_min": 0.25, "w_std": 0.14645230770111084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2716.6876220703125, "cov_mean": 1.541716937936144e-05, "cov_std": 0.37162280082702637, "entropy": 0.36669921875, "epoch": 0.045714285714285714, "grad_norm": 0.27566489577293396, "kl": 0.0022230148315429688, "learning_rate": 7.75e-07, "loss": -0.0203, "reward": 0.729166679084301, "reward_std": 0.3497198149561882, "rewards/accuracy_reward": 0.15625000093132257, "rewards/format_reward": 0.5729166716337204, "step": 40, "w_high_ratio": 0.05738469213247299, "w_low_ratio": 0.03804673533886671, "w_max": 1.8830105662345886, "w_mean": 1.3182978928089142, "w_min": 0.0, "w_std": 0.2204515039920807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3265.8334350585938, "cov_mean": 4.474487604966271e-05, "cov_std": 0.3514489606022835, "entropy": 0.38134765625, "epoch": 0.046857142857142854, "grad_norm": 0.25198379158973694, "kl": 0.00047135353088378906, "learning_rate": 7.612622032536507e-07, "loss": -0.0348, "reward": 0.5625000204890966, "reward_std": 0.5001779943704605, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.41666669212281704, "step": 41, "w_high_ratio": 0.060201918706297874, "w_low_ratio": 0.03864650521427393, "w_max": 1.765150785446167, "w_mean": 1.2053866684436798, "w_min": 1.0444519155498174e-26, "w_std": 0.20880188420414925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3039.260498046875, "cov_mean": 4.992843923901091e-05, "cov_std": 0.12326429784297943, "entropy": 0.4560546875, "epoch": 0.048, "grad_norm": 0.10436006635427475, "kl": 0.0007390975952148438, "learning_rate": 7.472670160550848e-07, "loss": 0.0037, "reward": 0.3333333358168602, "reward_std": 0.15885811299085617, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.31250000186264515, "step": 42, "w_high_ratio": 0.0625, "w_low_ratio": 0.016482737846672535, "w_max": 1.6183056831359863, "w_mean": 1.1592219173908234, "w_min": 0.5, "w_std": 0.06949653849005699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3322.5208740234375, "cov_mean": 1.5148519764807133e-05, "cov_std": 0.3199145011603832, "entropy": 0.39794921875, "epoch": 0.04914285714285714, "grad_norm": 0.22395218908786774, "kl": 0.0014238357543945312, "learning_rate": 7.330314893841101e-07, "loss": 0.0251, "reward": 0.45833334140479565, "reward_std": 0.34082313999533653, "rewards/accuracy_reward": 0.16666666977107525, "rewards/format_reward": 0.2916666716337204, "step": 43, "w_high_ratio": 0.0, "w_low_ratio": 0.03815819416195154, "w_max": 1.3987224996089935, "w_mean": 1.0974957346916199, "w_min": 0.0, "w_std": 0.18725593388080597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2873.947967529297, "cov_mean": 1.1262121915933676e-05, "cov_std": 0.3456302881240845, "entropy": 0.37548828125, "epoch": 0.05028571428571429, "grad_norm": 0.570351243019104, "kl": 0.0011093616485595703, "learning_rate": 7.185729670371604e-07, "loss": -0.0802, "reward": 0.7500000447034836, "reward_std": 0.31945212185382843, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.4687500074505806, "step": 44, "w_high_ratio": 0.15513932332396507, "w_low_ratio": 0.028022687416523695, "w_max": 2.1701363921165466, "w_mean": 1.4684883952140808, "w_min": 0.25, "w_std": 0.245870441198349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3659.1355590820312, "cov_mean": -1.3919307093601674e-05, "cov_std": 0.3560608774423599, "entropy": 0.4130859375, "epoch": 0.05142857142857143, "grad_norm": 0.33030757308006287, "kl": 0.0011370182037353516, "learning_rate": 7.039090644965509e-07, "loss": 0.0303, "reward": 0.416666679084301, "reward_std": 0.5311296693980694, "rewards/accuracy_reward": 0.16666667349636555, "rewards/format_reward": 0.25, "step": 45, "w_high_ratio": 0.0, "w_low_ratio": 0.04757110681384802, "w_max": 1.4556901454925537, "w_mean": 1.08749720454216, "w_min": 0.0, "w_std": 0.21380594745278358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3486.7916870117188, "cov_mean": 2.1816805514163207e-05, "cov_std": 0.2602475844323635, "entropy": 0.4794921875, "epoch": 0.052571428571428575, "grad_norm": 0.28776484727859497, "kl": 0.0007176399230957031, "learning_rate": 6.890576474687263e-07, "loss": 0.058, "reward": 0.23958334233611822, "reward_std": 0.2835810258984566, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.20833333488553762, "step": 46, "w_high_ratio": 0.0, "w_low_ratio": 0.03342714952304959, "w_max": 1.2925868034362793, "w_mean": 1.0403871834278107, "w_min": 1.838248673476915e-29, "w_std": 0.14680924825370312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3093.4479370117188, "cov_mean": -4.72289493700373e-05, "cov_std": 0.39596526324748993, "entropy": 0.3857421875, "epoch": 0.053714285714285714, "grad_norm": 0.3468870520591736, "kl": 0.0010945796966552734, "learning_rate": 6.740368101176495e-07, "loss": -0.064, "reward": 0.84375, "reward_std": 0.5307980924844742, "rewards/accuracy_reward": 0.322916679084301, "rewards/format_reward": 0.5208333432674408, "step": 47, "w_high_ratio": 0.10172786563634872, "w_low_ratio": 0.03984384797513485, "w_max": 1.9562607407569885, "w_mean": 1.385090559720993, "w_min": 0.25, "w_std": 0.24371833354234695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2902.979278564453, "cov_mean": -2.4175317776098382e-05, "cov_std": 0.4257803037762642, "entropy": 0.388671875, "epoch": 0.054857142857142854, "grad_norm": 0.5910794734954834, "kl": 0.002681732177734375, "learning_rate": 6.588648530198504e-07, "loss": -0.0371, "reward": 0.6562500074505806, "reward_std": 0.493436336517334, "rewards/accuracy_reward": 0.22916666697710752, "rewards/format_reward": 0.42708334140479565, "step": 48, "w_high_ratio": 0.05750561133027077, "w_low_ratio": 0.0480266478843987, "w_max": 1.9184067249298096, "w_mean": 1.2389512956142426, "w_min": 9.458764634192515e-45, "w_std": 0.2734759133309126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2592.4271545410156, "cov_mean": -5.175127171241911e-06, "cov_std": 0.3634071573615074, "entropy": 0.36669921875, "epoch": 0.056, "grad_norm": 0.33904796838760376, "kl": 0.006507396697998047, "learning_rate": 6.435602608679916e-07, "loss": 0.0033, "reward": 0.8437500447034836, "reward_std": 0.5799632221460342, "rewards/accuracy_reward": 0.27083333767950535, "rewards/format_reward": 0.5729166865348816, "step": 49, "w_high_ratio": 0.0624999962747097, "w_low_ratio": 0.04148435592651367, "w_max": 1.7326014041900635, "w_mean": 1.239928662776947, "w_min": 4.925564102101732e-43, "w_std": 0.23503416404128075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3253.1355590820312, "cov_mean": -2.146356928278692e-05, "cov_std": 0.2279180847108364, "entropy": 0.35498046875, "epoch": 0.05714285714285714, "grad_norm": 0.15721456706523895, "kl": 0.0008764266967773438, "learning_rate": 6.281416799501187e-07, "loss": -0.0086, "reward": 0.635416679084301, "reward_std": 0.3225880041718483, "rewards/accuracy_reward": 0.2708333386108279, "rewards/format_reward": 0.3645833395421505, "step": 50, "w_high_ratio": 0.0625, "w_low_ratio": 0.028236051555722952, "w_max": 1.5944055318832397, "w_mean": 1.1758275628089905, "w_min": 0.25, "w_std": 0.12848308496177197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2558.1043090820312, "cov_mean": 5.439032975118607e-05, "cov_std": 0.29480236768722534, "entropy": 0.43408203125, "epoch": 0.05828571428571429, "grad_norm": 0.38427427411079407, "kl": 0.00363922119140625, "learning_rate": 6.126278954320294e-07, "loss": 0.0141, "reward": 0.6666666865348816, "reward_std": 0.2830107621848583, "rewards/accuracy_reward": 0.13541666977107525, "rewards/format_reward": 0.53125, "step": 51, "w_high_ratio": 0.0, "w_low_ratio": 0.029488239903002977, "w_max": 1.5800293982028961, "w_mean": 1.1582573056221008, "w_min": 0.25, "w_std": 0.16206533834338188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3121.510467529297, "cov_mean": 4.514171632763464e-05, "cov_std": 0.3024504631757736, "entropy": 0.38525390625, "epoch": 0.05942857142857143, "grad_norm": 0.38342365622520447, "kl": 0.0027284622192382812, "learning_rate": 5.97037808470444e-07, "loss": -0.0158, "reward": 0.6666666772216558, "reward_std": 0.48630647361278534, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.41666668467223644, "step": 52, "w_high_ratio": 0.0, "w_low_ratio": 0.048233418725430965, "w_max": 1.4871755540370941, "w_mean": 1.1300460696220398, "w_min": 0.0, "w_std": 0.221306212246418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3089.6146850585938, "cov_mean": 1.5540852473350242e-05, "cov_std": 0.4309914745390415, "entropy": 0.42138671875, "epoch": 0.060571428571428575, "grad_norm": 0.32499778270721436, "kl": 0.001051187515258789, "learning_rate": 5.813904131848564e-07, "loss": -0.0334, "reward": 0.9062500447034836, "reward_std": 0.615619845688343, "rewards/accuracy_reward": 0.3020833432674408, "rewards/format_reward": 0.6041666865348816, "step": 53, "w_high_ratio": 0.0, "w_low_ratio": 0.05247905198484659, "w_max": 1.4815186858177185, "w_mean": 1.1437698602676392, "w_min": 0.0, "w_std": 0.2556675784289837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2948.479248046875, "cov_mean": 5.7689636832947144e-05, "cov_std": 0.4967670738697052, "entropy": 0.3662109375, "epoch": 0.061714285714285715, "grad_norm": 0.5084431767463684, "kl": 0.0007784366607666016, "learning_rate": 5.657047735161255e-07, "loss": -0.0313, "reward": 1.0208333730697632, "reward_std": 0.6375530436635017, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.5833333507180214, "step": 54, "w_high_ratio": 0.12949026003479958, "w_low_ratio": 0.03830569516867399, "w_max": 1.9918950200080872, "w_mean": 1.3933196365833282, "w_min": 0.0, "w_std": 0.25901878997683525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3389.4376220703125, "cov_mean": 1.1409799526518327e-05, "cov_std": 0.2976246848702431, "entropy": 0.40380859375, "epoch": 0.06285714285714286, "grad_norm": 0.2319922149181366, "kl": 0.0006909370422363281, "learning_rate": 5.5e-07, "loss": 0.0036, "reward": 0.614583358168602, "reward_std": 0.4451694190502167, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.3645833432674408, "step": 55, "w_high_ratio": 0.008611755445599556, "w_low_ratio": 0.038189588114619255, "w_max": 1.4636406004428864, "w_mean": 1.113456517457962, "w_min": 0.25, "w_std": 0.1765221506357193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3142.2188720703125, "cov_mean": -1.977225656446535e-05, "cov_std": 0.4043290466070175, "entropy": 0.36962890625, "epoch": 0.064, "grad_norm": 0.5777710676193237, "kl": 0.003372669219970703, "learning_rate": 5.342952264838747e-07, "loss": -0.0624, "reward": 0.666666679084301, "reward_std": 0.36768075451254845, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.4583333358168602, "step": 56, "w_high_ratio": 0.02573772892355919, "w_low_ratio": 0.0384283890016377, "w_max": 1.6343314349651337, "w_mean": 1.174754112958908, "w_min": 0.0, "w_std": 0.2166101150214672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3526.041748046875, "cov_mean": 5.059504655946512e-05, "cov_std": 0.19530736654996872, "entropy": 0.3095703125, "epoch": 0.06514285714285714, "grad_norm": 0.14592108130455017, "kl": 0.0003554821014404297, "learning_rate": 5.186095868151436e-07, "loss": 0.0068, "reward": 0.4270833432674408, "reward_std": 0.2942384257912636, "rewards/accuracy_reward": 0.11458333861082792, "rewards/format_reward": 0.3125, "step": 57, "w_high_ratio": 0.0, "w_low_ratio": 0.024141859263181686, "w_max": 1.4007738828659058, "w_mean": 1.1122069656848907, "w_min": 0.25, "w_std": 0.10743825510144234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2448.125030517578, "cov_mean": 2.2156835257192142e-05, "cov_std": 0.41437317430973053, "entropy": 0.3251953125, "epoch": 0.06628571428571428, "grad_norm": 0.31742021441459656, "kl": 0.003879547119140625, "learning_rate": 5.02962191529556e-07, "loss": -0.0288, "reward": 1.0625000223517418, "reward_std": 0.4887799397110939, "rewards/accuracy_reward": 0.3437500037252903, "rewards/format_reward": 0.7187500074505806, "step": 58, "w_high_ratio": 0.0625, "w_low_ratio": 0.03917268430814147, "w_max": 1.9121226966381073, "w_mean": 1.4166812300682068, "w_min": 0.0, "w_std": 0.25635192170739174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3166.6458740234375, "cov_mean": -4.421618541528005e-06, "cov_std": 0.23622582852840424, "entropy": 0.33251953125, "epoch": 0.06742857142857143, "grad_norm": 0.5350203514099121, "kl": 0.0004105567932128906, "learning_rate": 4.873721045679706e-07, "loss": 0.0178, "reward": 0.572916672565043, "reward_std": 0.32521966844797134, "rewards/accuracy_reward": 0.20833333861082792, "rewards/format_reward": 0.36458334140479565, "step": 59, "w_high_ratio": 0.09398643299937248, "w_low_ratio": 0.029231622349470854, "w_max": 1.7523704767227173, "w_mean": 1.1871490776538849, "w_min": 0.0, "w_std": 0.18543793261051178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3290.9688720703125, "cov_mean": 2.4174340069293976e-05, "cov_std": 0.2900906167924404, "entropy": 0.37353515625, "epoch": 0.06857142857142857, "grad_norm": 0.3470577299594879, "kl": 0.002052783966064453, "learning_rate": 4.7185832004988133e-07, "loss": 0.0597, "reward": 0.4895833395421505, "reward_std": 0.39402854442596436, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.38541667722165585, "step": 60, "w_high_ratio": 0.0, "w_low_ratio": 0.03313248883932829, "w_max": 1.4663892686367035, "w_mean": 1.140129953622818, "w_min": 0.0, "w_std": 0.1620137356221676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3175.916748046875, "cov_mean": 4.250624078849796e-05, "cov_std": 0.4415072202682495, "entropy": 0.3564453125, "epoch": 0.06971428571428571, "grad_norm": 0.3775484263896942, "kl": 0.0009458065032958984, "learning_rate": 4.5643973913200837e-07, "loss": -0.0396, "reward": 0.791666679084301, "reward_std": 0.4833543188869953, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.541666679084301, "step": 61, "w_high_ratio": 0.04859437793493271, "w_low_ratio": 0.03715647594071925, "w_max": 1.7687608003616333, "w_mean": 1.2712468802928925, "w_min": 0.0, "w_std": 0.24916893057525158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2722.5313110351562, "cov_mean": 3.7332507645260193e-06, "cov_std": 0.3699860963970423, "entropy": 0.30859375, "epoch": 0.07085714285714285, "grad_norm": 1.0045063495635986, "kl": 0.02886199951171875, "learning_rate": 4.4113514698014953e-07, "loss": 0.0303, "reward": 0.8750000298023224, "reward_std": 0.5333737134933472, "rewards/accuracy_reward": 0.29166667722165585, "rewards/format_reward": 0.583333358168602, "step": 62, "w_high_ratio": 0.06600858364254236, "w_low_ratio": 0.04690547380596399, "w_max": 1.9925485253334045, "w_mean": 1.3004825711250305, "w_min": 0.0, "w_std": 0.25962154380977154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2446.8959045410156, "cov_mean": -3.0669682018924505e-05, "cov_std": 0.44857871532440186, "entropy": 0.3876953125, "epoch": 0.072, "grad_norm": 0.4270949065685272, "kl": 0.001964569091796875, "learning_rate": 4.2596318988235037e-07, "loss": -0.0266, "reward": 1.0208333879709244, "reward_std": 0.5118110477924347, "rewards/accuracy_reward": 0.3229166781529784, "rewards/format_reward": 0.6979166716337204, "step": 63, "w_high_ratio": 0.24508000910282135, "w_low_ratio": 0.04229559004306793, "w_max": 2.53433358669281, "w_mean": 1.5855962336063385, "w_min": 3.1441053104750004e-38, "w_std": 0.3284341022372246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3022.9896240234375, "cov_mean": -4.004061929663294e-05, "cov_std": 0.24233128875494003, "entropy": 0.376953125, "epoch": 0.07314285714285715, "grad_norm": 0.22661255300045013, "kl": 0.0009641647338867188, "learning_rate": 4.1094235253127374e-07, "loss": 0.0441, "reward": 0.6666666977107525, "reward_std": 0.42464151978492737, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.41666668467223644, "step": 64, "w_high_ratio": 0.05784625560045242, "w_low_ratio": 0.024026920087635517, "w_max": 1.9166311621665955, "w_mean": 1.3068864345550537, "w_min": 0.0, "w_std": 0.15533896535634995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2987.9584350585938, "cov_mean": -1.1828518722722947e-05, "cov_std": 0.3229193612933159, "entropy": 0.33544921875, "epoch": 0.07428571428571429, "grad_norm": 0.20347988605499268, "kl": 0.0009045600891113281, "learning_rate": 3.9609093550344907e-07, "loss": 0.0257, "reward": 0.7187500037252903, "reward_std": 0.4016053378582001, "rewards/accuracy_reward": 0.2500000027939677, "rewards/format_reward": 0.46875, "step": 65, "w_high_ratio": 0.0625, "w_low_ratio": 0.04217576887458563, "w_max": 1.5320913791656494, "w_mean": 1.177913784980774, "w_min": 0.0, "w_std": 0.17242734879255295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2277.3542709350586, "cov_mean": 2.5952657381367317e-05, "cov_std": 0.2567654103040695, "entropy": 0.30517578125, "epoch": 0.07542857142857143, "grad_norm": 0.39114055037498474, "kl": 0.0011463165283203125, "learning_rate": 3.8142703296283953e-07, "loss": 0.0283, "reward": 0.8750000111758709, "reward_std": 0.32678278163075447, "rewards/accuracy_reward": 0.35416667349636555, "rewards/format_reward": 0.520833333954215, "step": 66, "w_high_ratio": 0.050385382026433945, "w_low_ratio": 0.036725505255162716, "w_max": 1.6868340969085693, "w_mean": 1.2045796811580658, "w_min": 4.016504513755072e-38, "w_std": 0.18733475357294083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3692.8646240234375, "cov_mean": -1.3762917205895064e-05, "cov_std": 0.1253320723772049, "entropy": 0.3369140625, "epoch": 0.07657142857142857, "grad_norm": 0.09913324564695358, "kl": 0.0007886886596679688, "learning_rate": 3.6696851061588994e-07, "loss": -0.0047, "reward": 0.1979166716337204, "reward_std": 0.14884886890649796, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.1666666716337204, "step": 67, "w_high_ratio": 0.0, "w_low_ratio": 0.014627222903072834, "w_max": 1.3194924890995026, "w_mean": 1.0943252593278885, "w_min": 0.5, "w_std": 0.08329889550805092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2192.4896240234375, "cov_mean": 5.210197195992805e-05, "cov_std": 0.2974618822336197, "entropy": 0.3623046875, "epoch": 0.07771428571428571, "grad_norm": 0.48533204197883606, "kl": 0.004261016845703125, "learning_rate": 3.5273298394491515e-07, "loss": -0.0773, "reward": 0.864583358168602, "reward_std": 0.3760298416018486, "rewards/accuracy_reward": 0.2395833432674408, "rewards/format_reward": 0.625, "step": 68, "w_high_ratio": 0.2266840934753418, "w_low_ratio": 0.028733241837471724, "w_max": 2.714290827512741, "w_mean": 1.5712151527404785, "w_min": 0.25, "w_std": 0.22248514741659164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2785.791748046875, "cov_mean": 5.550627793127205e-05, "cov_std": 0.24297186359763145, "entropy": 0.435546875, "epoch": 0.07885714285714286, "grad_norm": 0.4455502927303314, "kl": 0.003231048583984375, "learning_rate": 3.387377967463493e-07, "loss": -0.0721, "reward": 0.4791666716337204, "reward_std": 0.22700205445289612, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.4479166716337204, "step": 69, "w_high_ratio": 0.18034584820270538, "w_low_ratio": 0.019336777739226818, "w_max": 2.187383383512497, "w_mean": 1.3382967710494995, "w_min": 0.25, "w_std": 0.14556573703885078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3339.760498046875, "cov_mean": 3.0603625077674224e-05, "cov_std": 0.3574352115392685, "entropy": 0.35400390625, "epoch": 0.08, "grad_norm": 0.4004529118537903, "kl": 0.0030617713928222656, "learning_rate": 3.250000000000001e-07, "loss": 0.0435, "reward": 0.48958334513008595, "reward_std": 0.448788546025753, "rewards/accuracy_reward": 0.1145833395421505, "rewards/format_reward": 0.37500001676380634, "step": 70, "w_high_ratio": 0.056590817868709564, "w_low_ratio": 0.04363738652318716, "w_max": 1.621414452791214, "w_mean": 1.1405479907989502, "w_min": 0.0, "w_std": 0.21908994019031525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2994.0834197998047, "cov_mean": 5.412803147919476e-06, "cov_std": 0.24315955862402916, "entropy": 0.39501953125, "epoch": 0.08114285714285714, "grad_norm": 0.43441104888916016, "kl": 0.003234386444091797, "learning_rate": 3.115363310950578e-07, "loss": 0.0297, "reward": 0.4791666669771075, "reward_std": 0.2685965895652771, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.3333333460614085, "step": 71, "w_high_ratio": 0.05637816712260246, "w_low_ratio": 0.024559800047427416, "w_max": 1.8640311062335968, "w_mean": 1.2403113842010498, "w_min": 0.25, "w_std": 0.18912436068058014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3187.5313110351562, "cov_mean": -5.872446490684524e-05, "cov_std": 0.47990038990974426, "entropy": 0.45556640625, "epoch": 0.08228571428571428, "grad_norm": 0.49201899766921997, "kl": 0.0031557083129882812, "learning_rate": 2.9836319343816397e-07, "loss": -0.0603, "reward": 0.510416679084301, "reward_std": 0.46190567314624786, "rewards/accuracy_reward": 0.07291666697710752, "rewards/format_reward": 0.4375000074505806, "step": 72, "w_high_ratio": 0.13975006341934204, "w_low_ratio": 0.04982540290802717, "w_max": 1.821915477514267, "w_mean": 1.327934205532074, "w_min": 0.0, "w_std": 0.2968912795186043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3788.3125610351562, "cov_mean": -2.8033528451487655e-05, "cov_std": 0.27588948607444763, "entropy": 0.44091796875, "epoch": 0.08342857142857144, "grad_norm": 0.1494845449924469, "kl": 0.00031256675720214844, "learning_rate": 2.854966364683872e-07, "loss": -0.0024, "reward": 0.3541666716337204, "reward_std": 0.38621756434440613, "rewards/accuracy_reward": 0.1354166716337204, "rewards/format_reward": 0.2187500037252903, "step": 73, "w_high_ratio": 0.0, "w_low_ratio": 0.03520650416612625, "w_max": 1.26780503988266, "w_mean": 1.0412998497486115, "w_min": 0.25, "w_std": 0.1442563608288765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3536.3751220703125, "cov_mean": 5.950678314547986e-05, "cov_std": 0.23448628932237625, "entropy": 0.3876953125, "epoch": 0.08457142857142858, "grad_norm": 0.26546525955200195, "kl": 0.0013303756713867188, "learning_rate": 2.729523361034538e-07, "loss": 0.0267, "reward": 0.4791666865348816, "reward_std": 0.3228915072977543, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.2604166716337204, "step": 74, "w_high_ratio": 0.0, "w_low_ratio": 0.030950261279940605, "w_max": 1.3889389336109161, "w_mean": 1.0708437263965607, "w_min": 0.0, "w_std": 0.1411808580160141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3248.729248046875, "cov_mean": -2.3449571926903445e-05, "cov_std": 0.35559114813804626, "entropy": 0.35546875, "epoch": 0.08571428571428572, "grad_norm": 0.23657207190990448, "kl": 0.0014653205871582031, "learning_rate": 2.6074557564105724e-07, "loss": 0.0671, "reward": 0.5625000111758709, "reward_std": 0.37235569953918457, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.3750000037252903, "step": 75, "w_high_ratio": 0.05818156525492668, "w_low_ratio": 0.04433906823396683, "w_max": 1.794895738363266, "w_mean": 1.1899544298648834, "w_min": 0.25, "w_std": 0.21789918839931488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3163.4583740234375, "cov_mean": 1.617619454918895e-05, "cov_std": 0.1999940201640129, "entropy": 0.37890625, "epoch": 0.08685714285714285, "grad_norm": 0.1549980193376541, "kl": 0.00043952465057373047, "learning_rate": 2.488912271385139e-07, "loss": 0.016, "reward": 0.4479166865348816, "reward_std": 0.24646350741386414, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.4166666828095913, "step": 76, "w_high_ratio": 0.0625, "w_low_ratio": 0.027174705173820257, "w_max": 1.7661568522453308, "w_mean": 1.2089157700538635, "w_min": 0.25, "w_std": 0.11429934948682785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3343.1146850585938, "cov_mean": -4.351784809841774e-06, "cov_std": 0.24954789131879807, "entropy": 0.38623046875, "epoch": 0.088, "grad_norm": 0.16160623729228973, "kl": 0.0005016326904296875, "learning_rate": 2.374037332934512e-07, "loss": -0.0028, "reward": 0.5416666977107525, "reward_std": 0.3281986638903618, "rewards/accuracy_reward": 0.15625001024454832, "rewards/format_reward": 0.3854166828095913, "step": 77, "w_high_ratio": 0.0, "w_low_ratio": 0.025709405075758696, "w_max": 1.4693324863910675, "w_mean": 1.1720669269561768, "w_min": 0.25, "w_std": 0.13362640514969826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3459.5625610351562, "cov_mean": 1.626165885681985e-05, "cov_std": 0.48687614500522614, "entropy": 0.37744140625, "epoch": 0.08914285714285715, "grad_norm": 0.24848157167434692, "kl": 0.0005955696105957031, "learning_rate": 2.2629708984760706e-07, "loss": -0.0075, "reward": 0.604166679084301, "reward_std": 0.6587165221571922, "rewards/accuracy_reward": 0.23958334140479565, "rewards/format_reward": 0.3645833395421505, "step": 78, "w_high_ratio": 0.04009601101279259, "w_low_ratio": 0.051254406571388245, "w_max": 1.6542562246322632, "w_mean": 1.174688458442688, "w_min": 0.0, "w_std": 0.27206049114465714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2496.4479370117188, "cov_mean": 4.12471272284165e-05, "cov_std": 0.26665735617280006, "entropy": 0.3173828125, "epoch": 0.09028571428571429, "grad_norm": 0.3005116581916809, "kl": 0.0013518333435058594, "learning_rate": 2.1558482853517253e-07, "loss": -0.0126, "reward": 0.8854166865348816, "reward_std": 0.3675435855984688, "rewards/accuracy_reward": 0.2604166744276881, "rewards/format_reward": 0.6250000149011612, "step": 79, "w_high_ratio": 0.10870501399040222, "w_low_ratio": 0.03222281183116138, "w_max": 1.7319224178791046, "w_mean": 1.2745553255081177, "w_min": 0.0, "w_std": 0.17969628423452377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3547.4063110351562, "cov_mean": -3.7483160667761695e-05, "cov_std": 0.2408691681921482, "entropy": 0.4462890625, "epoch": 0.09142857142857143, "grad_norm": 0.18163201212882996, "kl": 0.001146554946899414, "learning_rate": 2.0528000059645995e-07, "loss": 0.0002, "reward": 0.447916679084301, "reward_std": 0.4143633097410202, "rewards/accuracy_reward": 0.14583333674818277, "rewards/format_reward": 0.30208333395421505, "step": 80, "w_high_ratio": 0.0, "w_low_ratio": 0.028170655015856028, "w_max": 1.4094052910804749, "w_mean": 1.0949373543262482, "w_min": 0.25, "w_std": 0.12403910420835018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3345.6771850585938, "cov_mean": 5.422214962891303e-05, "cov_std": 0.2975510209798813, "entropy": 0.50634765625, "epoch": 0.09257142857142857, "grad_norm": 0.23768211901187897, "kl": 0.0026841163635253906, "learning_rate": 1.9539516087697517e-07, "loss": 0.0359, "reward": 0.4375, "reward_std": 0.29862353205680847, "rewards/accuracy_reward": 0.09375000558793545, "rewards/format_reward": 0.3437500149011612, "step": 81, "w_high_ratio": 0.0, "w_low_ratio": 0.039988940581679344, "w_max": 1.4104009568691254, "w_mean": 1.1443687975406647, "w_min": 0.25, "w_std": 0.15021733939647675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3128.5313110351562, "cov_mean": -4.930557446414241e-05, "cov_std": 0.1716586910188198, "entropy": 0.40576171875, "epoch": 0.09371428571428571, "grad_norm": 0.2130168080329895, "kl": 0.0011749267578125, "learning_rate": 1.8594235253127372e-07, "loss": -0.0095, "reward": 0.645833358168602, "reward_std": 0.28174424916505814, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.3958333358168602, "step": 82, "w_high_ratio": 0.0625, "w_low_ratio": 0.020149634685367346, "w_max": 1.7989980578422546, "w_mean": 1.2575880885124207, "w_min": 0.25, "w_std": 0.11121102049946785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3006.2188110351562, "cov_mean": -4.109572182642296e-05, "cov_std": 0.26048484444618225, "entropy": 0.41064453125, "epoch": 0.09485714285714286, "grad_norm": 0.342207670211792, "kl": 0.002028226852416992, "learning_rate": 1.7693309235023127e-07, "loss": -0.0346, "reward": 0.5625000149011612, "reward_std": 0.40875787287950516, "rewards/accuracy_reward": 0.19791667256504297, "rewards/format_reward": 0.3645833469927311, "step": 83, "w_high_ratio": 0.054204463958740234, "w_low_ratio": 0.02545588812790811, "w_max": 1.758713960647583, "w_mean": 1.165531873703003, "w_min": 1.1802768922825628e-26, "w_std": 0.13860907219350338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3249.6770935058594, "cov_mean": 1.03536285678274e-06, "cov_std": 0.3258565291762352, "entropy": 0.40576171875, "epoch": 0.096, "grad_norm": 0.3563970625400543, "kl": 0.0010137557983398438, "learning_rate": 1.6837835672960831e-07, "loss": -0.0069, "reward": 0.6979166865348816, "reward_std": 0.44399677217006683, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.4062500149011612, "step": 84, "w_high_ratio": 0.0561169758439064, "w_low_ratio": 0.032409061677753925, "w_max": 1.6246945261955261, "w_mean": 1.1986894607543945, "w_min": 0.25, "w_std": 0.20476746186614037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3284.145965576172, "cov_mean": -2.360550899993541e-05, "cov_std": 0.322089783847332, "entropy": 0.313232421875, "epoch": 0.09714285714285714, "grad_norm": 0.2012510597705841, "kl": 0.00048732757568359375, "learning_rate": 1.6028856829700258e-07, "loss": 0.022, "reward": 0.6041666939854622, "reward_std": 0.5810144916176796, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.4166666753590107, "step": 85, "w_high_ratio": 0.04291652888059616, "w_low_ratio": 0.044021588284522295, "w_max": 1.5220162570476532, "w_mean": 1.2036064565181732, "w_min": 0.0, "w_std": 0.2122751884162426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3284.5208740234375, "cov_mean": 7.192745897555142e-05, "cov_std": 0.2986150402575731, "entropy": 0.4326171875, "epoch": 0.09828571428571428, "grad_norm": 0.47124311327934265, "kl": 0.00186920166015625, "learning_rate": 1.5267358321348285e-07, "loss": -0.0661, "reward": 0.541666679084301, "reward_std": 0.26891910284757614, "rewards/accuracy_reward": 0.15625000279396772, "rewards/format_reward": 0.385416679084301, "step": 86, "w_high_ratio": 0.14706559106707573, "w_low_ratio": 0.02547268127091229, "w_max": 2.1021605730056763, "w_mean": 1.3188546895980835, "w_min": 0.25, "w_std": 0.20756208524107933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2947.7188110351562, "cov_mean": -3.2274874229187844e-05, "cov_std": 0.491459421813488, "entropy": 0.478515625, "epoch": 0.09942857142857142, "grad_norm": 0.4571603536605835, "kl": 0.00186920166015625, "learning_rate": 1.4554267916537495e-07, "loss": -0.0175, "reward": 0.7604167014360428, "reward_std": 0.5320771858096123, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.5729166939854622, "step": 87, "w_high_ratio": 0.09944025427103043, "w_low_ratio": 0.051899916026741266, "w_max": 2.0812322199344635, "w_mean": 1.3910081684589386, "w_min": 0.0, "w_std": 0.28863297775387764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2671.010528564453, "cov_mean": 8.175054426828865e-05, "cov_std": 0.5737064629793167, "entropy": 0.42578125, "epoch": 0.10057142857142858, "grad_norm": 0.8297140598297119, "kl": 0.006494998931884766, "learning_rate": 1.3890454406082956e-07, "loss": -0.0247, "reward": 1.052083358168602, "reward_std": 0.6991286426782608, "rewards/accuracy_reward": 0.385416679084301, "rewards/format_reward": 0.6666666865348816, "step": 88, "w_high_ratio": 0.19003082811832428, "w_low_ratio": 0.0572223337367177, "w_max": 2.4647006690502167, "w_mean": 1.4986785650253296, "w_min": 0.0, "w_std": 0.39150019735097885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3554.2291870117188, "cov_mean": -3.470801129878964e-05, "cov_std": 0.33541250973939896, "entropy": 0.4052734375, "epoch": 0.10171428571428572, "grad_norm": 0.35713109374046326, "kl": 0.0019817352294921875, "learning_rate": 1.3276726544494571e-07, "loss": -0.0037, "reward": 0.5625000074505806, "reward_std": 0.5584763810038567, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.3333333358168602, "step": 89, "w_high_ratio": 0.0, "w_low_ratio": 0.046938784420490265, "w_max": 1.4760373830795288, "w_mean": 1.1144923567771912, "w_min": 4.203895392974451e-45, "w_std": 0.22363057732582092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2691.072998046875, "cov_mean": 3.220158714611898e-05, "cov_std": 0.29605602473020554, "entropy": 0.5078125, "epoch": 0.10285714285714286, "grad_norm": 0.3885078728199005, "kl": 0.0037512779235839844, "learning_rate": 1.2713832064634125e-07, "loss": 0.0576, "reward": 0.5833333432674408, "reward_std": 0.29030610620975494, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.5208333535119891, "step": 90, "w_high_ratio": 0.029189134016633034, "w_low_ratio": 0.02723024832084775, "w_max": 1.7544832229614258, "w_mean": 1.3031582236289978, "w_min": 0.25, "w_std": 0.1653207652270794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3429.2813110351562, "cov_mean": 4.58851766893531e-06, "cov_std": 0.255037359893322, "entropy": 0.44970703125, "epoch": 0.104, "grad_norm": 0.21901822090148926, "kl": 0.0016050338745117188, "learning_rate": 1.220245676671809e-07, "loss": 0.0086, "reward": 0.4791666679084301, "reward_std": 0.42312975227832794, "rewards/accuracy_reward": 0.14583334140479565, "rewards/format_reward": 0.33333334140479565, "step": 91, "w_high_ratio": 0.0, "w_low_ratio": 0.03593640075996518, "w_max": 1.3688502609729767, "w_mean": 1.1163062751293182, "w_min": 0.0, "w_std": 0.16501911543309689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2883.354217529297, "cov_mean": 0.00012286239780223696, "cov_std": 0.3810538575053215, "entropy": 0.35205078125, "epoch": 0.10514285714285715, "grad_norm": 0.3391018509864807, "kl": 0.0031414031982421875, "learning_rate": 1.1743223682775649e-07, "loss": 0.0203, "reward": 0.6979167014360428, "reward_std": 0.44955648854374886, "rewards/accuracy_reward": 0.18750001024454832, "rewards/format_reward": 0.5104166716337204, "step": 92, "w_high_ratio": 0.04653368145227432, "w_low_ratio": 0.04205216746777296, "w_max": 1.9255772531032562, "w_mean": 1.2982309758663177, "w_min": 1.1786321383436036e-41, "w_std": 0.2336835414171219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3868.2708740234375, "cov_mean": 6.7976218360854546e-06, "cov_std": 0.18175788596272469, "entropy": 0.52001953125, "epoch": 0.10628571428571429, "grad_norm": 0.20475780963897705, "kl": 0.0019414424896240234, "learning_rate": 1.1336692317580158e-07, "loss": 0.0439, "reward": 0.07291666883975267, "reward_std": 0.16979892551898956, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.06250000186264515, "step": 93, "w_high_ratio": 0.0, "w_low_ratio": 0.026896574534475803, "w_max": 1.2091633975505829, "w_mean": 1.0145649313926697, "w_min": 0.25, "w_std": 0.10182364657521248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3365.0313720703125, "cov_mean": 5.4338164773071185e-05, "cov_std": 0.3005314916372299, "entropy": 0.498046875, "epoch": 0.10742857142857143, "grad_norm": 0.2650594711303711, "kl": 0.0022263526916503906, "learning_rate": 1.0983357966978745e-07, "loss": -0.0027, "reward": 0.40625001303851604, "reward_std": 0.3088777959346771, "rewards/accuracy_reward": 0.13541666977107525, "rewards/format_reward": 0.27083333395421505, "step": 94, "w_high_ratio": 0.0, "w_low_ratio": 0.03483147523365915, "w_max": 1.5337306559085846, "w_mean": 1.0965670943260193, "w_min": 3.503246160812043e-46, "w_std": 0.16744443587958813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3801.90625, "cov_mean": 3.249865312682232e-05, "cov_std": 0.22550634294748306, "entropy": 0.39990234375, "epoch": 0.10857142857142857, "grad_norm": 0.1532547026872635, "kl": 0.00028705596923828125, "learning_rate": 1.068365111445064e-07, "loss": 0.0152, "reward": 0.2395833395421505, "reward_std": 0.36024561524391174, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.16666666977107525, "step": 95, "w_high_ratio": 0.0, "w_low_ratio": 0.027520990930497646, "w_max": 1.2406704127788544, "w_mean": 1.0429764091968536, "w_min": 0.25, "w_std": 0.12879234366118908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2939.229248046875, "cov_mean": -0.00012694882570940536, "cov_std": 0.42605962604284286, "entropy": 0.35693359375, "epoch": 0.10971428571428571, "grad_norm": 0.46688124537467957, "kl": 0.0020439624786376953, "learning_rate": 1.0437936906629334e-07, "loss": -0.018, "reward": 0.7500000298023224, "reward_std": 0.5337796807289124, "rewards/accuracy_reward": 0.260416679084301, "rewards/format_reward": 0.4895833432674408, "step": 96, "w_high_ratio": 0.09955519065260887, "w_low_ratio": 0.04101596772670746, "w_max": 2.1674250662326813, "w_mean": 1.3317046463489532, "w_min": 0.0, "w_std": 0.2793598398566246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3480.8334350585938, "cov_mean": -2.861599477910204e-05, "cov_std": 0.2684932127594948, "entropy": 0.400390625, "epoch": 0.11085714285714286, "grad_norm": 0.336283415555954, "kl": 0.0008549690246582031, "learning_rate": 1.0246514708427701e-07, "loss": -0.0136, "reward": 0.5416666716337204, "reward_std": 0.3185878023505211, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.3229166716337204, "step": 97, "w_high_ratio": 0.0, "w_low_ratio": 0.03224400524049997, "w_max": 1.3480738699436188, "w_mean": 1.129571795463562, "w_min": 0.25, "w_std": 0.1670757606625557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3187.0418090820312, "cov_mean": -3.800686135946307e-05, "cov_std": 0.3548770062625408, "entropy": 0.37158203125, "epoch": 0.112, "grad_norm": 0.4635946452617645, "kl": 0.0006475448608398438, "learning_rate": 1.0109617738307911e-07, "loss": -0.0451, "reward": 0.5312500149011612, "reward_std": 0.2919851318001747, "rewards/accuracy_reward": 0.13541666977107525, "rewards/format_reward": 0.3958333432674408, "step": 98, "w_high_ratio": 0.08355391025543213, "w_low_ratio": 0.03265182231552899, "w_max": 1.7274979948997498, "w_mean": 1.2217411994934082, "w_min": 0.0, "w_std": 0.2073941007256508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 3087.635467529297, "cov_mean": 6.88847137553239e-06, "cov_std": 0.1545543149113655, "entropy": 0.34375, "epoch": 0.11314285714285714, "grad_norm": 0.336643785238266, "kl": 0.001453399658203125, "learning_rate": 1.002741278414069e-07, "loss": 0.0319, "reward": 0.5000000204890966, "reward_std": 0.22891659289598465, "rewards/accuracy_reward": 0.19791667442768812, "rewards/format_reward": 0.3020833348855376, "step": 99, "w_high_ratio": 0.06669171899557114, "w_low_ratio": 0.019805304240435362, "w_max": 1.3878345787525177, "w_mean": 1.1877047568559647, "w_min": 0.25, "w_std": 0.10978816263377666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 2950.5938110351562, "cov_mean": 4.911199903290253e-05, "cov_std": 0.4131714701652527, "entropy": 0.34521484375, "epoch": 0.11428571428571428, "grad_norm": 0.6010158061981201, "kl": 0.0022754669189453125, "learning_rate": 1e-07, "loss": -0.0995, "reward": 0.8750000223517418, "reward_std": 0.5578841716051102, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.5208333507180214, "step": 100, "w_high_ratio": 0.19728600606322289, "w_low_ratio": 0.03664776147343218, "w_max": 2.3017463386058807, "w_mean": 1.389022558927536, "w_min": 4.2088412759042857e-39, "w_std": 0.2764138747006655 }, { "epoch": 0.11428571428571428, "step": 100, "total_flos": 0.0, "train_loss": -0.0008640377339906991, "train_runtime": 8354.3803, "train_samples_per_second": 1.149, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }