{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.11428571428571428, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2700.4271850585938, "cov_mean": -2.6832926778297406e-05, "cov_std": 0.24635104648768902, "entropy": 0.36865234375, "epoch": 0.001142857142857143, "grad_norm": 0.35615867376327515, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0696, "reward": 0.7604166893288493, "reward_std": 0.4268697127699852, "rewards/accuracy_reward": 0.25000001303851604, "rewards/format_reward": 0.5104166669771075, "step": 1, "w_high_ratio": 0.2208261415362358, "w_low_ratio": 0.027151118498295546, "w_max": 2.1915207505226135, "w_mean": 1.4711343348026276, "w_min": 1.404075949984986e-37, "w_std": 0.24041971936821938 }, { "completion_length": 3127.3958435058594, "cov_mean": -1.8215427189716138e-05, "cov_std": 0.18336841650307178, "entropy": 0.353515625, "epoch": 0.002285714285714286, "grad_norm": 0.18010225892066956, "kl": 0.0, "learning_rate": 1e-07, "loss": 0.0533, "reward": 0.6458333637565374, "reward_std": 0.4249730706214905, "rewards/accuracy_reward": 0.2812500102445483, "rewards/format_reward": 0.3645833386108279, "step": 2, "w_high_ratio": 0.05701034888625145, "w_low_ratio": 0.023528859252110124, "w_max": 1.811183512210846, "w_mean": 1.2113382518291473, "w_min": 0.0, "w_std": 0.15613791532814503 }, { "completion_length": 3691.0626220703125, "cov_mean": 2.796226033296989e-05, "cov_std": 0.1637928392738104, "entropy": 0.44189453125, "epoch": 0.0034285714285714284, "grad_norm": 0.1356951743364334, "kl": 3.916025161743164e-05, "learning_rate": 2e-07, "loss": 0.052, "reward": 0.19791667256504297, "reward_std": 0.3607826754450798, "rewards/accuracy_reward": 0.05208333395421505, "rewards/format_reward": 0.14583333674818277, "step": 3, "w_high_ratio": 0.0, "w_low_ratio": 0.02235229848884046, "w_max": 1.460817277431488, "w_mean": 1.082369714975357, "w_min": 3.3280838527714405e-44, "w_std": 0.12309953197836876 }, { "completion_length": 2353.2709350585938, "cov_mean": 1.0425418167869793e-05, "cov_std": 0.3036706894636154, "entropy": 0.41259765625, "epoch": 0.004571428571428572, "grad_norm": 0.190170019865036, "kl": 3.3348798751831055e-05, "learning_rate": 3e-07, "loss": 0.0459, "reward": 0.8750000149011612, "reward_std": 0.5107106417417526, "rewards/accuracy_reward": 0.1875000037252903, "rewards/format_reward": 0.6875000298023224, "step": 4, "w_high_ratio": 0.2652290016412735, "w_low_ratio": 0.034206886775791645, "w_max": 2.106997400522232, "w_mean": 1.5420070886611938, "w_min": 2.4617042843759845e-36, "w_std": 0.2812090367078781 }, { "completion_length": 3485.1771850585938, "cov_mean": 3.2382055223934003e-06, "cov_std": 0.29665667191147804, "entropy": 0.4609375, "epoch": 0.005714285714285714, "grad_norm": 0.2197088897228241, "kl": 4.2125582695007324e-05, "learning_rate": 4e-07, "loss": 0.0803, "reward": 0.46875001303851604, "reward_std": 0.5515270829200745, "rewards/accuracy_reward": 0.1145833358168602, "rewards/format_reward": 0.35416667722165585, "step": 5, "w_high_ratio": 0.008333034813404083, "w_low_ratio": 0.04545952333137393, "w_max": 1.5202394425868988, "w_mean": 1.1503158807754517, "w_min": 5.693325166185118e-29, "w_std": 0.23378031328320503 }, { "completion_length": 3451.2500610351562, "cov_mean": -4.464495305001037e-05, "cov_std": 0.236886378377676, "entropy": 0.46142578125, "epoch": 0.006857142857142857, "grad_norm": 0.13218647241592407, "kl": 4.482269287109375e-05, "learning_rate": 5e-07, "loss": 0.0517, "reward": 0.3645833507180214, "reward_std": 0.515114888548851, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.28125000186264515, "step": 6, "w_high_ratio": 0.0625, "w_low_ratio": 0.031897591426968575, "w_max": 1.5891262888908386, "w_mean": 1.1359511613845825, "w_min": 0.0, "w_std": 0.14766533859074116 }, { "completion_length": 3224.3125610351562, "cov_mean": 4.349886694399174e-06, "cov_std": 0.3991788253188133, "entropy": 0.38671875, "epoch": 0.008, "grad_norm": 0.22412240505218506, "kl": 2.1651387214660645e-05, "learning_rate": 6e-07, "loss": 0.0742, "reward": 0.8541666865348816, "reward_std": 0.6870906725525856, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.6250000149011612, "step": 7, "w_high_ratio": 0.047733694314956665, "w_low_ratio": 0.053672163281589746, "w_max": 1.5988431572914124, "w_mean": 1.2651265263557434, "w_min": 6.1929912552473734e-37, "w_std": 0.2889493927359581 }, { "completion_length": 2800.9583740234375, "cov_mean": 1.0622998161124997e-06, "cov_std": 0.15430260822176933, "entropy": 0.33740234375, "epoch": 0.009142857142857144, "grad_norm": 0.11328813433647156, "kl": 1.7002224922180176e-05, "learning_rate": 7e-07, "loss": 0.0184, "reward": 0.8958333730697632, "reward_std": 0.25296592339873314, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.4895833432674408, "step": 8, "w_high_ratio": 0.0, "w_low_ratio": 0.01592865912243724, "w_max": 1.5333127677440643, "w_mean": 1.2431240677833557, "w_min": 0.25, "w_std": 0.11287659406661987 }, { "completion_length": 3369.791748046875, "cov_mean": -2.2509159407491097e-05, "cov_std": 0.20683829113841057, "entropy": 0.45263671875, "epoch": 0.010285714285714285, "grad_norm": 0.1632954180240631, "kl": 4.3064355850219727e-05, "learning_rate": 8e-07, "loss": 0.0417, "reward": 0.4583333507180214, "reward_std": 0.3903508894145489, "rewards/accuracy_reward": 0.1145833358168602, "rewards/format_reward": 0.3437500074505806, "step": 9, "w_high_ratio": 0.0, "w_low_ratio": 0.029203591868281364, "w_max": 1.6281995177268982, "w_mean": 1.1540252268314362, "w_min": 1.9273542721946577e-23, "w_std": 0.15844954177737236 }, { "completion_length": 2794.2291870117188, "cov_mean": 7.258828873091261e-06, "cov_std": 0.22622444108128548, "entropy": 0.34716796875, "epoch": 0.011428571428571429, "grad_norm": 0.11491074413061142, "kl": 2.664327621459961e-05, "learning_rate": 9e-07, "loss": 0.0765, "reward": 0.6145833432674408, "reward_std": 0.4795500487089157, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.4479166828095913, "step": 10, "w_high_ratio": 0.1683393381536007, "w_low_ratio": 0.03143396740779281, "w_max": 1.8651617467403412, "w_mean": 1.2822044789791107, "w_min": 2.2624703592113335e-38, "w_std": 0.20682579837739468 }, { "completion_length": 3703.197998046875, "cov_mean": -2.311449361513951e-05, "cov_std": 0.18362887762486935, "entropy": 0.39697265625, "epoch": 0.012571428571428572, "grad_norm": 0.12180526554584503, "kl": 2.73287296295166e-05, "learning_rate": 1e-06, "loss": 0.0401, "reward": 0.26041667722165585, "reward_std": 0.3744332268834114, "rewards/accuracy_reward": 0.10416667256504297, "rewards/format_reward": 0.15625000558793545, "step": 11, "w_high_ratio": 0.027614232152700424, "w_low_ratio": 0.024976021610200405, "w_max": 1.3482708036899567, "w_mean": 1.0835402309894562, "w_min": 0.25, "w_std": 0.12668619584292173 }, { "completion_length": 2611.260498046875, "cov_mean": 1.0368909215685562e-05, "cov_std": 0.22750693373382092, "entropy": 0.3984375, "epoch": 0.013714285714285714, "grad_norm": 0.1790972799062729, "kl": 2.802908420562744e-05, "learning_rate": 9.997258721585931e-07, "loss": 0.0576, "reward": 0.7395833805203438, "reward_std": 0.4462515264749527, "rewards/accuracy_reward": 0.1354166716337204, "rewards/format_reward": 0.604166679084301, "step": 12, "w_high_ratio": 0.10206323117017746, "w_low_ratio": 0.0320228124037385, "w_max": 2.1756480634212494, "w_mean": 1.4741427898406982, "w_min": 4.420129648185005e-23, "w_std": 0.23598888516426086 }, { "completion_length": 3224.041748046875, "cov_mean": -2.2661331968265586e-05, "cov_std": 0.15935274586081505, "entropy": 0.38427734375, "epoch": 0.014857142857142857, "grad_norm": 0.24569930136203766, "kl": 1.7702579498291016e-05, "learning_rate": 9.989038226169207e-07, "loss": 0.0498, "reward": 0.604166679084301, "reward_std": 0.30622391402721405, "rewards/accuracy_reward": 0.21875000558793545, "rewards/format_reward": 0.385416679084301, "step": 13, "w_high_ratio": 0.20555464923381805, "w_low_ratio": 0.01726952870376408, "w_max": 1.8901410400867462, "w_mean": 1.31855970621109, "w_min": 1.1411503146395655e-35, "w_std": 0.14450976066291332 }, { "completion_length": 3125.197998046875, "cov_mean": -2.9986793833813863e-05, "cov_std": 0.18112273141741753, "entropy": 0.3623046875, "epoch": 0.016, "grad_norm": 0.14652569591999054, "kl": 7.249414920806885e-06, "learning_rate": 9.975348529157229e-07, "loss": 0.0596, "reward": 0.5000000111758709, "reward_std": 0.3975026085972786, "rewards/accuracy_reward": 0.13541667070239782, "rewards/format_reward": 0.3645833432674408, "step": 14, "w_high_ratio": 0.0, "w_low_ratio": 0.026353970635682344, "w_max": 1.6898008584976196, "w_mean": 1.1696374714374542, "w_min": 2.786338774190119e-34, "w_std": 0.17131789773702621 }, { "completion_length": 2945.3959350585938, "cov_mean": 3.4355500702076824e-06, "cov_std": 0.18097041826695204, "entropy": 0.37109375, "epoch": 0.017142857142857144, "grad_norm": 0.08377102017402649, "kl": 2.804398536682129e-05, "learning_rate": 9.956206309337066e-07, "loss": 0.0282, "reward": 0.6875000149011612, "reward_std": 0.37770550325512886, "rewards/accuracy_reward": 0.2395833395421505, "rewards/format_reward": 0.4479166716337204, "step": 15, "w_high_ratio": 0.0, "w_low_ratio": 0.027025693794712424, "w_max": 1.4225987792015076, "w_mean": 1.1180275976657867, "w_min": 8.951121255272305e-16, "w_std": 0.14841708727180958 }, { "completion_length": 3842.3646240234375, "cov_mean": -3.1302830393542536e-05, "cov_std": 0.16068686172366142, "entropy": 0.458984375, "epoch": 0.018285714285714287, "grad_norm": 0.1221930980682373, "kl": 2.4199485778808594e-05, "learning_rate": 9.931634888554935e-07, "loss": 0.0431, "reward": 0.1562500037252903, "reward_std": 0.3155686669051647, "rewards/accuracy_reward": 0.05208333395421505, "rewards/format_reward": 0.10416666977107525, "step": 16, "w_high_ratio": 0.0, "w_low_ratio": 0.022678226232528687, "w_max": 1.1968038976192474, "w_mean": 1.0266980826854706, "w_min": 0.25, "w_std": 0.10604305937886238 }, { "completion_length": 2433.1875915527344, "cov_mean": 2.4951528757810593e-05, "cov_std": 0.27749199233949184, "entropy": 0.44970703125, "epoch": 0.019428571428571427, "grad_norm": 0.13208113610744476, "kl": 5.91278076171875e-05, "learning_rate": 9.901664203302124e-07, "loss": -0.0048, "reward": 0.8854166865348816, "reward_std": 0.4504813477396965, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.6145833358168602, "step": 17, "w_high_ratio": 0.171035997569561, "w_low_ratio": 0.03585993289016187, "w_max": 2.199991285800934, "w_mean": 1.4317797720432281, "w_min": 0.25, "w_std": 0.24799126759171486 }, { "completion_length": 3167.4791870117188, "cov_mean": -2.742706919889315e-05, "cov_std": 0.2498251087963581, "entropy": 0.369140625, "epoch": 0.02057142857142857, "grad_norm": 0.15079385042190552, "kl": 2.0952895283699036e-05, "learning_rate": 9.866330768241983e-07, "loss": 0.0577, "reward": 0.5729166939854622, "reward_std": 0.5097959190607071, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.4062500149011612, "step": 18, "w_high_ratio": 0.0, "w_low_ratio": 0.034274401143193245, "w_max": 1.457490622997284, "w_mean": 1.1403506994247437, "w_min": 0.0, "w_std": 0.16276290826499462 }, { "completion_length": 3139.635498046875, "cov_mean": 1.529891596874222e-05, "cov_std": 0.13174043968319893, "entropy": 0.39208984375, "epoch": 0.021714285714285714, "grad_norm": 0.08939936012029648, "kl": 5.739927291870117e-05, "learning_rate": 9.825677631722435e-07, "loss": 0.0233, "reward": 0.8541666828095913, "reward_std": 0.3213166669011116, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.47916666977107525, "step": 19, "w_high_ratio": 0.035190850496292114, "w_low_ratio": 0.016721592284739017, "w_max": 1.647162914276123, "w_mean": 1.2506683766841888, "w_min": 0.25, "w_std": 0.09711403585970402 }, { "completion_length": 2464.385498046875, "cov_mean": 3.157450896651426e-05, "cov_std": 0.283736914396286, "entropy": 0.3369140625, "epoch": 0.022857142857142857, "grad_norm": 0.3488742411136627, "kl": 9.429454803466797e-05, "learning_rate": 9.779754323328192e-07, "loss": 0.0914, "reward": 0.9375000596046448, "reward_std": 0.4943716749548912, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.6770833656191826, "step": 20, "w_high_ratio": 0.0566110759973526, "w_low_ratio": 0.031579687260091305, "w_max": 2.3184494078159332, "w_mean": 1.4481623768806458, "w_min": 0.0, "w_std": 0.28026906587183475 }, { "completion_length": 2847.21875, "cov_mean": -2.243259518763807e-05, "cov_std": 0.18684318475425243, "entropy": 0.423828125, "epoch": 0.024, "grad_norm": 0.13671471178531647, "kl": 0.00033351778984069824, "learning_rate": 9.728616793536587e-07, "loss": 0.0498, "reward": 0.6562500204890966, "reward_std": 0.38981083035469055, "rewards/accuracy_reward": 0.18750000465661287, "rewards/format_reward": 0.4687500027939677, "step": 21, "w_high_ratio": 0.08715118188410997, "w_low_ratio": 0.021108672255650163, "w_max": 1.9608261287212372, "w_mean": 1.387522131204605, "w_min": 4.576730842832761e-23, "w_std": 0.14338573440909386 }, { "completion_length": 1849.3542175292969, "cov_mean": -5.1019123930018395e-05, "cov_std": 0.208794716745615, "entropy": 0.3994140625, "epoch": 0.025142857142857144, "grad_norm": 0.1823095828294754, "kl": 0.00039577484130859375, "learning_rate": 9.672327345550543e-07, "loss": 0.0503, "reward": 1.1041666716337204, "reward_std": 0.363413717597723, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.833333358168602, "step": 22, "w_high_ratio": 0.3683718554675579, "w_low_ratio": 0.028410385129973292, "w_max": 2.5447845458984375, "w_mean": 1.7170847058296204, "w_min": 1.7296456515038733e-32, "w_std": 0.18113290891051292 }, { "completion_length": 2786.604217529297, "cov_mean": 6.156731569717522e-05, "cov_std": 0.21581846103072166, "entropy": 0.3828125, "epoch": 0.026285714285714287, "grad_norm": 0.10202132165431976, "kl": 0.00020551681518554688, "learning_rate": 9.610954559391704e-07, "loss": 0.0436, "reward": 0.708333358168602, "reward_std": 0.47307053953409195, "rewards/accuracy_reward": 0.22916667722165585, "rewards/format_reward": 0.4791666716337204, "step": 23, "w_high_ratio": 0.05155515298247337, "w_low_ratio": 0.03038623696193099, "w_max": 1.8245242238044739, "w_mean": 1.2686880826950073, "w_min": 2.0108632963061125e-43, "w_std": 0.19650068879127502 }, { "completion_length": 2932.14599609375, "cov_mean": 3.685860542645969e-05, "cov_std": 0.19567562174052, "entropy": 0.35986328125, "epoch": 0.027428571428571427, "grad_norm": 0.11536505818367004, "kl": 0.00012372806668281555, "learning_rate": 9.54457320834625e-07, "loss": 0.0259, "reward": 0.8229167014360428, "reward_std": 0.41623104363679886, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.5520833507180214, "step": 24, "w_high_ratio": 0.05616182088851929, "w_low_ratio": 0.025760386954061687, "w_max": 1.7301380336284637, "w_mean": 1.2399356663227081, "w_min": 0.25, "w_std": 0.16614723671227694 }, { "completion_length": 2980.104248046875, "cov_mean": 5.5617931593587855e-06, "cov_std": 0.20383853651583195, "entropy": 0.43408203125, "epoch": 0.02857142857142857, "grad_norm": 0.13895268738269806, "kl": 0.0003798753023147583, "learning_rate": 9.473264167865171e-07, "loss": 0.0328, "reward": 0.6145833609625697, "reward_std": 0.4326799139380455, "rewards/accuracy_reward": 0.20833334140479565, "rewards/format_reward": 0.4062500102445483, "step": 25, "w_high_ratio": 0.15471260249614716, "w_low_ratio": 0.027910931850783527, "w_max": 1.878886878490448, "w_mean": 1.3457823991775513, "w_min": 2.1938871947043534e-19, "w_std": 0.21430648770183325 }, { "completion_length": 3121.822998046875, "cov_mean": -1.5570902263561948e-05, "cov_std": 0.12479476444423199, "entropy": 0.41845703125, "epoch": 0.029714285714285714, "grad_norm": 0.055989839136600494, "kl": 5.704164505004883e-05, "learning_rate": 9.397114317029974e-07, "loss": 0.006, "reward": 0.7395833656191826, "reward_std": 0.2778088226914406, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.4895833358168602, "step": 26, "w_high_ratio": 0.05233287438750267, "w_low_ratio": 0.016803464153781533, "w_max": 1.667470008134842, "w_mean": 1.2069356143474579, "w_min": 1.8282741064045888e-40, "w_std": 0.12309898342937231 }, { "completion_length": 3419.697998046875, "cov_mean": -8.16069814391085e-06, "cov_std": 0.2326441928744316, "entropy": 0.45654296875, "epoch": 0.030857142857142857, "grad_norm": 0.1154065951704979, "kl": 0.0001367814838886261, "learning_rate": 9.316216432703916e-07, "loss": 0.0629, "reward": 0.4687500223517418, "reward_std": 0.4522514268755913, "rewards/accuracy_reward": 0.11458333861082792, "rewards/format_reward": 0.3541666716337204, "step": 27, "w_high_ratio": 0.125, "w_low_ratio": 0.03899317281320691, "w_max": 1.7131148278713226, "w_mean": 1.2755843102931976, "w_min": 2.3244726053753363e-31, "w_std": 0.15411211177706718 }, { "completion_length": 3003.3334350585938, "cov_mean": 3.4548415897006635e-06, "cov_std": 0.18210824206471443, "entropy": 0.40576171875, "epoch": 0.032, "grad_norm": 0.10329318046569824, "kl": 0.00033906102180480957, "learning_rate": 9.230669076497687e-07, "loss": 0.0466, "reward": 0.729166679084301, "reward_std": 0.4190382733941078, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.4375000149011612, "step": 28, "w_high_ratio": 0.045407865196466446, "w_low_ratio": 0.020685997209511697, "w_max": 1.9191896319389343, "w_mean": 1.28102046251297, "w_min": 1.1079171471645686e-36, "w_std": 0.15209556370973587 }, { "completion_length": 3622.8438110351562, "cov_mean": -1.4215014289220562e-05, "cov_std": 0.19959762692451477, "entropy": 0.43701171875, "epoch": 0.03314285714285714, "grad_norm": 0.11793094128370285, "kl": 0.00043398141860961914, "learning_rate": 9.140576474687263e-07, "loss": 0.0686, "reward": 0.2812500149011612, "reward_std": 0.343124657869339, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.2187500074505806, "step": 29, "w_high_ratio": 0.0, "w_low_ratio": 0.03135715611279011, "w_max": 1.389756977558136, "w_mean": 1.1044960916042328, "w_min": 0.5, "w_std": 0.15529824048280716 }, { "completion_length": 3211.6563110351562, "cov_mean": 2.0011442074974184e-05, "cov_std": 0.3438211902976036, "entropy": 0.40087890625, "epoch": 0.03428571428571429, "grad_norm": 0.15425726771354675, "kl": 0.0005748271942138672, "learning_rate": 9.046048391230247e-07, "loss": 0.096, "reward": 0.7708333544433117, "reward_std": 0.6762835085391998, "rewards/accuracy_reward": 0.2812500139698386, "rewards/format_reward": 0.489583358168602, "step": 30, "w_high_ratio": 0.0955454632639885, "w_low_ratio": 0.04459251323714852, "w_max": 1.7270594835281372, "w_mean": 1.2695180475711823, "w_min": 0.0, "w_std": 0.22970640659332275 }, { "completion_length": 3313.7500610351562, "cov_mean": -6.734976523148362e-07, "cov_std": 0.1561581064015627, "entropy": 0.38623046875, "epoch": 0.03542857142857143, "grad_norm": 0.12495917081832886, "kl": 0.00024247169494628906, "learning_rate": 8.9471999940354e-07, "loss": 0.0659, "reward": 0.47916669212281704, "reward_std": 0.3414399288594723, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.31250001303851604, "step": 31, "w_high_ratio": 0.12074629962444305, "w_low_ratio": 0.02205055020749569, "w_max": 1.9694485068321228, "w_mean": 1.323824942111969, "w_min": 3.479372530225627e-30, "w_std": 0.15388164669275284 }, { "completion_length": 3430.5521850585938, "cov_mean": -1.8880080915550934e-05, "cov_std": 0.24603740125894547, "entropy": 0.4443359375, "epoch": 0.036571428571428574, "grad_norm": 0.10446158051490784, "kl": 0.00040030479431152344, "learning_rate": 8.844151714648274e-07, "loss": 0.0417, "reward": 0.6875000223517418, "reward_std": 0.4970519095659256, "rewards/accuracy_reward": 0.2604166679084301, "rewards/format_reward": 0.4270833507180214, "step": 32, "w_high_ratio": 0.0, "w_low_ratio": 0.03472677152603865, "w_max": 1.578925609588623, "w_mean": 1.1632550954818726, "w_min": 0.0, "w_std": 0.17994992434978485 }, { "completion_length": 3569.229248046875, "cov_mean": -1.2864127711509354e-05, "cov_std": 0.21655914932489395, "entropy": 0.3828125, "epoch": 0.037714285714285714, "grad_norm": 0.11952047049999237, "kl": 0.00048720836639404297, "learning_rate": 8.737029101523929e-07, "loss": 0.0213, "reward": 0.5833333507180214, "reward_std": 0.4569981172680855, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.3541666716337204, "step": 33, "w_high_ratio": 0.0, "w_low_ratio": 0.03175507392734289, "w_max": 1.3123357892036438, "w_mean": 1.0974721312522888, "w_min": 0.25, "w_std": 0.16161495074629784 }, { "completion_length": 2714.0000610351562, "cov_mean": -3.014505455212202e-05, "cov_std": 0.24434123933315277, "entropy": 0.462890625, "epoch": 0.038857142857142854, "grad_norm": 0.24054297804832458, "kl": 0.0010285377502441406, "learning_rate": 8.625962667065487e-07, "loss": 0.0264, "reward": 0.8541666865348816, "reward_std": 0.43565599620342255, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.5208333432674408, "step": 34, "w_high_ratio": 0.08968023210763931, "w_low_ratio": 0.03058682754635811, "w_max": 1.8369105458259583, "w_mean": 1.3255797028541565, "w_min": 0.25, "w_std": 0.23013706505298615 }, { "completion_length": 3206.260498046875, "cov_mean": 1.827982691793295e-05, "cov_std": 0.2361072190105915, "entropy": 0.42578125, "epoch": 0.04, "grad_norm": 0.13136403262615204, "kl": 0.0009332895278930664, "learning_rate": 8.511087728614862e-07, "loss": 0.0513, "reward": 0.5625000149011612, "reward_std": 0.47499874979257584, "rewards/accuracy_reward": 0.1979166679084301, "rewards/format_reward": 0.3645833469927311, "step": 35, "w_high_ratio": 0.0, "w_low_ratio": 0.03539817640557885, "w_max": 1.4059478044509888, "w_mean": 1.1172049045562744, "w_min": 0.0, "w_std": 0.1749916821718216 }, { "completion_length": 3699.416748046875, "cov_mean": -1.4997711559772142e-05, "cov_std": 0.20064959302544594, "entropy": 0.50439453125, "epoch": 0.04114285714285714, "grad_norm": 0.1509845107793808, "kl": 0.0011619925498962402, "learning_rate": 8.392544243589427e-07, "loss": 0.0429, "reward": 0.2500000102445483, "reward_std": 0.41391417384147644, "rewards/accuracy_reward": 0.031250000931322575, "rewards/format_reward": 0.21875000279396772, "step": 36, "w_high_ratio": 0.0, "w_low_ratio": 0.029611330712214112, "w_max": 1.3176401853561401, "w_mean": 1.0739335417747498, "w_min": 0.0, "w_std": 0.15683909878134727 }, { "completion_length": 3516.1563110351562, "cov_mean": -2.547230405980372e-05, "cov_std": 0.11416707932949066, "entropy": 0.43994140625, "epoch": 0.04228571428571429, "grad_norm": 0.08760611712932587, "kl": 0.0007746219635009766, "learning_rate": 8.270476638965461e-07, "loss": 0.0156, "reward": 0.22916667442768812, "reward_std": 0.19299374520778656, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.21875001024454832, "step": 37, "w_high_ratio": 0.05747595056891441, "w_low_ratio": 0.013685875572264194, "w_max": 1.6239450573921204, "w_mean": 1.1775790452957153, "w_min": 0.25, "w_std": 0.12097344920039177 }, { "completion_length": 3670.822998046875, "cov_mean": -4.859739419771358e-06, "cov_std": 0.11942135915160179, "entropy": 0.4833984375, "epoch": 0.04342857142857143, "grad_norm": 0.08295677602291107, "kl": 0.0007152557373046875, "learning_rate": 8.145033635316128e-07, "loss": 0.016, "reward": 0.322916679084301, "reward_std": 0.24508872628211975, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.1770833432674408, "step": 38, "w_high_ratio": 0.0, "w_low_ratio": 0.015665842220187187, "w_max": 1.2278587818145752, "w_mean": 1.0692134499549866, "w_min": 0.5, "w_std": 0.08390428125858307 }, { "completion_length": 3133.5521850585938, "cov_mean": 1.6014040738809854e-05, "cov_std": 0.15454116463661194, "entropy": 0.38427734375, "epoch": 0.044571428571428574, "grad_norm": 0.09236446022987366, "kl": 0.0011830329895019531, "learning_rate": 8.01636806561836e-07, "loss": 0.0191, "reward": 0.770833358168602, "reward_std": 0.30482664704322815, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.520833358168602, "step": 39, "w_high_ratio": 0.0, "w_low_ratio": 0.021826621610671282, "w_max": 1.4440618753433228, "w_mean": 1.1500347554683685, "w_min": 0.25, "w_std": 0.10476426035165787 }, { "completion_length": 2921.5938110351562, "cov_mean": 3.307407860120293e-05, "cov_std": 0.18591826409101486, "entropy": 0.4111328125, "epoch": 0.045714285714285714, "grad_norm": 0.1281704157590866, "kl": 0.0041351318359375, "learning_rate": 7.884636689049422e-07, "loss": 0.0448, "reward": 0.6770833507180214, "reward_std": 0.39515648037195206, "rewards/accuracy_reward": 0.17708333395421505, "rewards/format_reward": 0.5000000149011612, "step": 40, "w_high_ratio": 0.09708013385534286, "w_low_ratio": 0.02820506482385099, "w_max": 1.9662592709064484, "w_mean": 1.3416504263877869, "w_min": 1.7285604841107016e-17, "w_std": 0.17729274183511734 }, { "completion_length": 3497.4063110351562, "cov_mean": -7.282104343175888e-05, "cov_std": 0.2843910865485668, "entropy": 0.40771484375, "epoch": 0.046857142857142854, "grad_norm": 0.16035234928131104, "kl": 0.0008625984191894531, "learning_rate": 7.75e-07, "loss": 0.0447, "reward": 0.4583333507180214, "reward_std": 0.5345464050769806, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.3333333507180214, "step": 41, "w_high_ratio": 0.015127741731703281, "w_low_ratio": 0.04166511259973049, "w_max": 1.6356081068515778, "w_mean": 1.1487390100955963, "w_min": 3.531651517161998e-24, "w_std": 0.21111097559332848 }, { "completion_length": 3070.854248046875, "cov_mean": 4.5878337004978675e-06, "cov_std": 0.0855317497625947, "entropy": 0.48388671875, "epoch": 0.048, "grad_norm": 0.06175260245800018, "kl": 0.0006914138793945312, "learning_rate": 7.612622032536507e-07, "loss": 0.0063, "reward": 0.3229166716337204, "reward_std": 0.17353228479623795, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.3125000074505806, "step": 42, "w_high_ratio": 0.05438845232129097, "w_low_ratio": 0.008411283954046667, "w_max": 1.544800043106079, "w_mean": 1.1694203615188599, "w_min": 0.5, "w_std": 0.07276808470487595 }, { "completion_length": 3378.8125610351562, "cov_mean": -2.5848277346085524e-05, "cov_std": 0.2625325694680214, "entropy": 0.43701171875, "epoch": 0.04914285714285714, "grad_norm": 0.18838584423065186, "kl": 0.0014505386352539062, "learning_rate": 7.472670160550848e-07, "loss": 0.0805, "reward": 0.479166679084301, "reward_std": 0.5196144729852676, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.2916666716337204, "step": 43, "w_high_ratio": 0.05749715492129326, "w_low_ratio": 0.038668573601171374, "w_max": 1.9480818212032318, "w_mean": 1.1805387139320374, "w_min": 2.3359345679368763e-34, "w_std": 0.1971494909375906 }, { "completion_length": 2916.9791717529297, "cov_mean": -3.640100658230949e-06, "cov_std": 0.23978274501860142, "entropy": 0.41162109375, "epoch": 0.05028571428571429, "grad_norm": 0.16967085003852844, "kl": 0.005632162094116211, "learning_rate": 7.330314893841101e-07, "loss": 0.0373, "reward": 0.7708333460614085, "reward_std": 0.43626825511455536, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.4895833386108279, "step": 44, "w_high_ratio": 0.11527429521083832, "w_low_ratio": 0.0326957437209785, "w_max": 1.8800793588161469, "w_mean": 1.356241375207901, "w_min": 0.0, "w_std": 0.22198213264346123 }, { "completion_length": 3686.1875610351562, "cov_mean": -1.2132580195611808e-05, "cov_std": 0.17040352895855904, "entropy": 0.4228515625, "epoch": 0.05142857142857143, "grad_norm": 0.0952582135796547, "kl": 0.0019674301147460938, "learning_rate": 7.185729670371604e-07, "loss": 0.0175, "reward": 0.4687500149011612, "reward_std": 0.3746139518916607, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.2500000074505806, "step": 45, "w_high_ratio": 0.0, "w_low_ratio": 0.022616846952587366, "w_max": 1.2881874740123749, "w_mean": 1.0595116317272186, "w_min": 0.25, "w_std": 0.11086289770901203 }, { "completion_length": 3573.8229370117188, "cov_mean": -5.21990023116814e-06, "cov_std": 0.09164197091013193, "entropy": 0.5361328125, "epoch": 0.052571428571428575, "grad_norm": 0.061532407999038696, "kl": 0.0024976730346679688, "learning_rate": 7.039090644965509e-07, "loss": 0.0226, "reward": 0.20833333488553762, "reward_std": 0.21344273164868355, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.1666666679084301, "step": 46, "w_high_ratio": 0.05877559259533882, "w_low_ratio": 0.011987740639597178, "w_max": 1.5505282580852509, "w_mean": 1.1429267823696136, "w_min": 3.571343117882909e-28, "w_std": 0.09135792590677738 }, { "completion_length": 3139.9375610351562, "cov_mean": 1.948155477293767e-05, "cov_std": 0.3308473080396652, "entropy": 0.43017578125, "epoch": 0.053714285714285714, "grad_norm": 0.287928968667984, "kl": 0.0011713504791259766, "learning_rate": 6.890576474687263e-07, "loss": 0.0536, "reward": 0.8541666716337204, "reward_std": 0.5766339302062988, "rewards/accuracy_reward": 0.322916679084301, "rewards/format_reward": 0.5312500149011612, "step": 47, "w_high_ratio": 0.12357743084430695, "w_low_ratio": 0.038592321798205376, "w_max": 2.0394512712955475, "w_mean": 1.3565464913845062, "w_min": 0.25, "w_std": 0.26638074964284897 }, { "completion_length": 3051.385467529297, "cov_mean": -6.733167197126022e-06, "cov_std": 0.19061635434627533, "entropy": 0.4501953125, "epoch": 0.054857142857142854, "grad_norm": 0.1374855488538742, "kl": 0.00525665283203125, "learning_rate": 6.740368101176495e-07, "loss": 0.0486, "reward": 0.5208333432674408, "reward_std": 0.37667082995176315, "rewards/accuracy_reward": 0.1770833395421505, "rewards/format_reward": 0.34375000838190317, "step": 48, "w_high_ratio": 0.12677159160375595, "w_low_ratio": 0.023622059728950262, "w_max": 2.024912714958191, "w_mean": 1.3519074320793152, "w_min": 7.271374424684445e-33, "w_std": 0.19886896945536137 }, { "completion_length": 2580.0521850585938, "cov_mean": 2.3781666641298216e-05, "cov_std": 0.2574050724506378, "entropy": 0.39306640625, "epoch": 0.056, "grad_norm": 0.13522955775260925, "kl": 0.0030527114868164062, "learning_rate": 6.588648530198504e-07, "loss": 0.0379, "reward": 0.8020833730697632, "reward_std": 0.47789302468299866, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.5937500149011612, "step": 49, "w_high_ratio": 0.0, "w_low_ratio": 0.03599585313349962, "w_max": 1.5133522152900696, "w_mean": 1.177164077758789, "w_min": 3.952712643244228e-41, "w_std": 0.19642843678593636 }, { "completion_length": 3276.2188110351562, "cov_mean": 3.7818183841409336e-05, "cov_std": 0.1878571268171072, "entropy": 0.36767578125, "epoch": 0.05714285714285714, "grad_norm": 0.10174579173326492, "kl": 0.0021190643310546875, "learning_rate": 6.435602608679916e-07, "loss": 0.04, "reward": 0.5937500102445483, "reward_std": 0.3714478053152561, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.3333333386108279, "step": 50, "w_high_ratio": 0.0, "w_low_ratio": 0.0255408501252532, "w_max": 1.3753422796726227, "w_mean": 1.1391299068927765, "w_min": 0.0, "w_std": 0.12728617619723082 }, { "completion_length": 2626.7084350585938, "cov_mean": -8.886720934242476e-06, "cov_std": 0.154384421184659, "entropy": 0.46826171875, "epoch": 0.05828571428571429, "grad_norm": 0.10762708634138107, "kl": 0.006221771240234375, "learning_rate": 6.281416799501187e-07, "loss": 0.038, "reward": 0.5416666865348816, "reward_std": 0.27000611275434494, "rewards/accuracy_reward": 0.08333333674818277, "rewards/format_reward": 0.4583333358168602, "step": 51, "w_high_ratio": 0.0, "w_low_ratio": 0.022287086583673954, "w_max": 1.522942990064621, "w_mean": 1.16915962100029, "w_min": 0.25, "w_std": 0.10866253450512886 }, { "completion_length": 3225.6875610351562, "cov_mean": -3.420543362153694e-05, "cov_std": 0.3140456900000572, "entropy": 0.41357421875, "epoch": 0.05942857142857143, "grad_norm": 0.16037067770957947, "kl": 0.0017061233520507812, "learning_rate": 6.126278954320294e-07, "loss": 0.0307, "reward": 0.833333358168602, "reward_std": 0.5814172253012657, "rewards/accuracy_reward": 0.3854166865348816, "rewards/format_reward": 0.4479166716337204, "step": 52, "w_high_ratio": 0.0, "w_low_ratio": 0.041595788672566414, "w_max": 1.4235666990280151, "w_mean": 1.1377580165863037, "w_min": 0.25, "w_std": 0.20165112614631653 }, { "completion_length": 3038.3126220703125, "cov_mean": 3.059411119465949e-05, "cov_std": 0.34266950748860836, "entropy": 0.43994140625, "epoch": 0.060571428571428575, "grad_norm": 0.18063929677009583, "kl": 0.004637241363525391, "learning_rate": 5.97037808470444e-07, "loss": 0.0228, "reward": 0.9062500447034836, "reward_std": 0.5983624011278152, "rewards/accuracy_reward": 0.3229166828095913, "rewards/format_reward": 0.583333358168602, "step": 53, "w_high_ratio": 0.0893278568983078, "w_low_ratio": 0.048161128303036094, "w_max": 1.605346292257309, "w_mean": 1.2202682793140411, "w_min": 3.5021185811519566e-32, "w_std": 0.2196234930306673 }, { "completion_length": 3048.3021850585938, "cov_mean": -3.5217308322899044e-05, "cov_std": 0.40455804020166397, "entropy": 0.4111328125, "epoch": 0.061714285714285715, "grad_norm": 0.26634302735328674, "kl": 0.0014967918395996094, "learning_rate": 5.813904131848564e-07, "loss": 0.0485, "reward": 1.0104167088866234, "reward_std": 0.6799703985452652, "rewards/accuracy_reward": 0.4375000111758709, "rewards/format_reward": 0.5729166828095913, "step": 54, "w_high_ratio": 0.14361883699893951, "w_low_ratio": 0.03988745156675577, "w_max": 1.914384812116623, "w_mean": 1.3435330390930176, "w_min": 0.0, "w_std": 0.2648167684674263 }, { "completion_length": 3372.3959350585938, "cov_mean": 5.4979325341264484e-06, "cov_std": 0.25056118331849575, "entropy": 0.44189453125, "epoch": 0.06285714285714286, "grad_norm": 0.0977473184466362, "kl": 0.0019044876098632812, "learning_rate": 5.657047735161255e-07, "loss": 0.0381, "reward": 0.6562500223517418, "reward_std": 0.4964308738708496, "rewards/accuracy_reward": 0.27083334140479565, "rewards/format_reward": 0.3854166716337204, "step": 55, "w_high_ratio": 0.0, "w_low_ratio": 0.0342027700971812, "w_max": 1.4320927858352661, "w_mean": 1.1150383353233337, "w_min": 0.0, "w_std": 0.18786128982901573 }, { "completion_length": 3225.2709045410156, "cov_mean": 5.9108706409460865e-06, "cov_std": 0.22232061624526978, "entropy": 0.43115234375, "epoch": 0.064, "grad_norm": 0.11878591775894165, "kl": 0.00154876708984375, "learning_rate": 5.5e-07, "loss": 0.0112, "reward": 0.6458333656191826, "reward_std": 0.39009611308574677, "rewards/accuracy_reward": 0.19791666697710752, "rewards/format_reward": 0.4479166828095913, "step": 56, "w_high_ratio": 0.0, "w_low_ratio": 0.028656802838668227, "w_max": 1.5001116394996643, "w_mean": 1.2033225297927856, "w_min": 1.9247332911380988e-31, "w_std": 0.18417230807244778 }, { "completion_length": 3587.6458740234375, "cov_mean": -4.519502726907376e-05, "cov_std": 0.24361642450094223, "entropy": 0.34423828125, "epoch": 0.06514285714285714, "grad_norm": 0.11000871658325195, "kl": 0.0006794929504394531, "learning_rate": 5.342952264838747e-07, "loss": 0.0423, "reward": 0.4375, "reward_std": 0.467288788408041, "rewards/accuracy_reward": 0.1354166716337204, "rewards/format_reward": 0.3020833358168602, "step": 57, "w_high_ratio": 0.0, "w_low_ratio": 0.03233239706605673, "w_max": 1.2505627870559692, "w_mean": 1.0757884085178375, "w_min": 0.25, "w_std": 0.14500370249152184 }, { "completion_length": 2626.947998046875, "cov_mean": -1.4388041108759353e-05, "cov_std": 0.2534067742526531, "entropy": 0.38427734375, "epoch": 0.06628571428571428, "grad_norm": 0.15062791109085083, "kl": 0.0038776397705078125, "learning_rate": 5.186095868151436e-07, "loss": 0.0665, "reward": 0.916666716337204, "reward_std": 0.43652553856372833, "rewards/accuracy_reward": 0.2812500009313226, "rewards/format_reward": 0.635416692122817, "step": 58, "w_high_ratio": 0.12001378461718559, "w_low_ratio": 0.03707017982378602, "w_max": 2.2393843233585358, "w_mean": 1.5285146832466125, "w_min": 2.338311714957214e-41, "w_std": 0.2577071785926819 }, { "completion_length": 3455.0208740234375, "cov_mean": -2.0748303086293163e-05, "cov_std": 0.2206678595393896, "entropy": 0.3994140625, "epoch": 0.06742857142857143, "grad_norm": 0.1635066419839859, "kl": 0.0013880729675292969, "learning_rate": 5.02962191529556e-07, "loss": 0.0447, "reward": 0.39583333395421505, "reward_std": 0.3752100467681885, "rewards/accuracy_reward": 0.14583333861082792, "rewards/format_reward": 0.2500000102445483, "step": 59, "w_high_ratio": 0.121368907392025, "w_low_ratio": 0.02756796986795962, "w_max": 1.8036501705646515, "w_mean": 1.2463297247886658, "w_min": 0.25, "w_std": 0.14911611750721931 }, { "completion_length": 3254.92724609375, "cov_mean": -1.7401176137354923e-05, "cov_std": 0.17664196342229843, "entropy": 0.396484375, "epoch": 0.06857142857142857, "grad_norm": 0.08288750052452087, "kl": 0.0025072097778320312, "learning_rate": 4.873721045679706e-07, "loss": 0.0281, "reward": 0.4583333432674408, "reward_std": 0.39030885696411133, "rewards/accuracy_reward": 0.11458333488553762, "rewards/format_reward": 0.34375000558793545, "step": 60, "w_high_ratio": 0.0, "w_low_ratio": 0.022112081991508603, "w_max": 1.5709031820297241, "w_mean": 1.1311749517917633, "w_min": 4.6449540846206874e-42, "w_std": 0.12603357434272766 }, { "completion_length": 3376.6771850585938, "cov_mean": 2.7930617193305807e-05, "cov_std": 0.22145200800150633, "entropy": 0.400390625, "epoch": 0.06971428571428571, "grad_norm": 0.2130555361509323, "kl": 0.0013718605041503906, "learning_rate": 4.7185832004988133e-07, "loss": 0.0505, "reward": 0.6666666669771075, "reward_std": 0.4248874858021736, "rewards/accuracy_reward": 0.1979166716337204, "rewards/format_reward": 0.4687500102445483, "step": 61, "w_high_ratio": 0.02656024508178234, "w_low_ratio": 0.02897683286573738, "w_max": 1.5726596117019653, "w_mean": 1.1504198908805847, "w_min": 0.25, "w_std": 0.15992471296340227 }, { "completion_length": 2804.3438110351562, "cov_mean": 5.110432311994373e-05, "cov_std": 0.3370564728975296, "entropy": 0.385986328125, "epoch": 0.07085714285714285, "grad_norm": 0.20217926800251007, "kl": 0.0056667327880859375, "learning_rate": 4.5643973913200837e-07, "loss": 0.0984, "reward": 0.854166679084301, "reward_std": 0.6372481435537338, "rewards/accuracy_reward": 0.29166667722165585, "rewards/format_reward": 0.5625000074505806, "step": 62, "w_high_ratio": 0.15436114370822906, "w_low_ratio": 0.04324930440634489, "w_max": 2.3450452983379364, "w_mean": 1.4747015237808228, "w_min": 2.7272514644043088e-36, "w_std": 0.31457675993442535 }, { "completion_length": 2748.1876220703125, "cov_mean": 9.652720564190531e-06, "cov_std": 0.3283480554819107, "entropy": 0.43310546875, "epoch": 0.072, "grad_norm": 0.15674079954624176, "kl": 0.0032906532287597656, "learning_rate": 4.4113514698014953e-07, "loss": 0.0725, "reward": 1.031250037252903, "reward_std": 0.5360563546419144, "rewards/accuracy_reward": 0.3541666753590107, "rewards/format_reward": 0.6770833432674408, "step": 63, "w_high_ratio": 0.0625, "w_low_ratio": 0.04385069524869323, "w_max": 1.8759834170341492, "w_mean": 1.3464274108409882, "w_min": 0.0, "w_std": 0.1958361305296421 }, { "completion_length": 3358.3334350585938, "cov_mean": -4.510660664891475e-05, "cov_std": 0.2794957533478737, "entropy": 0.44921875, "epoch": 0.07314285714285715, "grad_norm": 0.12678052484989166, "kl": 0.005690097808837891, "learning_rate": 4.2596318988235037e-07, "loss": 0.0603, "reward": 0.5937500149011612, "reward_std": 0.583733007311821, "rewards/accuracy_reward": 0.2187500037252903, "rewards/format_reward": 0.3750000074505806, "step": 64, "w_high_ratio": 0.044796403497457504, "w_low_ratio": 0.03829633165150881, "w_max": 1.6352408528327942, "w_mean": 1.167921930551529, "w_min": 2.6764800668604006e-43, "w_std": 0.20047394558787346 }, { "completion_length": 3026.5209350585938, "cov_mean": 1.992606485146098e-05, "cov_std": 0.17664698883891106, "entropy": 0.38525390625, "epoch": 0.07428571428571429, "grad_norm": 0.08567796647548676, "kl": 0.0034112930297851562, "learning_rate": 4.1094235253127374e-07, "loss": 0.0309, "reward": 0.6250000260770321, "reward_std": 0.32293669879436493, "rewards/accuracy_reward": 0.16666666697710752, "rewards/format_reward": 0.4583333395421505, "step": 65, "w_high_ratio": 0.0, "w_low_ratio": 0.029133206233382225, "w_max": 1.3120096027851105, "w_mean": 1.1221435964107513, "w_min": 1.0468715588988584e-22, "w_std": 0.12845914252102375 }, { "completion_length": 2413.1250610351562, "cov_mean": -4.743512135974015e-06, "cov_std": 0.05960770323872566, "entropy": 0.35693359375, "epoch": 0.07542857142857143, "grad_norm": 0.08712891489267349, "kl": 0.0032825469970703125, "learning_rate": 3.9609093550344907e-07, "loss": 0.0194, "reward": 0.8645833432674408, "reward_std": 0.13795074447989464, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.4895833358168602, "step": 66, "w_high_ratio": 0.04171403869986534, "w_low_ratio": 0.006300564622506499, "w_max": 1.5700030624866486, "w_mean": 1.1926406025886536, "w_min": 0.5264238715171814, "w_std": 0.06322706118226051 }, { "completion_length": 3792.9271240234375, "cov_mean": -9.492634717389592e-06, "cov_std": 0.12886795960366726, "entropy": 0.36865234375, "epoch": 0.07657142857142857, "grad_norm": 0.0828777328133583, "kl": 0.002822399139404297, "learning_rate": 3.8142703296283953e-07, "loss": 0.0179, "reward": 0.21875000558793545, "reward_std": 0.2869785502552986, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.16666667442768812, "step": 67, "w_high_ratio": 0.0, "w_low_ratio": 0.017554222606122494, "w_max": 1.1846114993095398, "w_mean": 1.0396882444620132, "w_min": 0.25, "w_std": 0.09070100169628859 }, { "completion_length": 2557.6146240234375, "cov_mean": 1.5981570413714508e-05, "cov_std": 0.31408151611685753, "entropy": 0.44384765625, "epoch": 0.07771428571428571, "grad_norm": 0.20466886460781097, "kl": 0.0033721923828125, "learning_rate": 3.6696851061588994e-07, "loss": 0.0566, "reward": 0.8020833507180214, "reward_std": 0.5168246552348137, "rewards/accuracy_reward": 0.2395833432674408, "rewards/format_reward": 0.5625000149011612, "step": 68, "w_high_ratio": 0.04399501532316208, "w_low_ratio": 0.042061637388542295, "w_max": 1.8962246477603912, "w_mean": 1.305674433708191, "w_min": 4.959685019058809e-39, "w_std": 0.2475343719124794 }, { "completion_length": 3009.7084350585938, "cov_mean": -4.52714293714962e-05, "cov_std": 0.23395150154829025, "entropy": 0.5517578125, "epoch": 0.07885714285714286, "grad_norm": 0.18218518793582916, "kl": 0.014312744140625, "learning_rate": 3.5273298394491515e-07, "loss": 0.0752, "reward": 0.4687500149011612, "reward_std": 0.43293242901563644, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.37500000558793545, "step": 69, "w_high_ratio": 0.057301439344882965, "w_low_ratio": 0.0333517212420702, "w_max": 1.9855602085590363, "w_mean": 1.2992196083068848, "w_min": 7.707141553786494e-45, "w_std": 0.19260139763355255 }, { "completion_length": 3280.0833740234375, "cov_mean": 2.8561088129208656e-05, "cov_std": 0.19498306885361671, "entropy": 0.3720703125, "epoch": 0.08, "grad_norm": 0.10543849319219589, "kl": 0.010352134704589844, "learning_rate": 3.387377967463493e-07, "loss": 0.0185, "reward": 0.5416667014360428, "reward_std": 0.3840207904577255, "rewards/accuracy_reward": 0.1250000074505806, "rewards/format_reward": 0.4166666716337204, "step": 70, "w_high_ratio": 0.0, "w_low_ratio": 0.02808787301182747, "w_max": 1.2882064878940582, "w_mean": 1.0968604385852814, "w_min": 0.25, "w_std": 0.1326066516339779 }, { "completion_length": 2855.7708740234375, "cov_mean": 4.004434208582097e-05, "cov_std": 0.13545112498104572, "entropy": 0.42724609375, "epoch": 0.08114285714285714, "grad_norm": 0.11269883066415787, "kl": 0.014951705932617188, "learning_rate": 3.250000000000001e-07, "loss": 0.0154, "reward": 0.5729166967794299, "reward_std": 0.2492993399500847, "rewards/accuracy_reward": 0.1979166716337204, "rewards/format_reward": 0.3750000027939677, "step": 71, "w_high_ratio": 0.08807118237018585, "w_low_ratio": 0.01654834917280823, "w_max": 1.8029770255088806, "w_mean": 1.2483810186386108, "w_min": 0.25, "w_std": 0.1558239422738552 }, { "completion_length": 3550.322998046875, "cov_mean": -4.4116359276813455e-07, "cov_std": 0.24292385205626488, "entropy": 0.5107421875, "epoch": 0.08228571428571428, "grad_norm": 0.20913389325141907, "kl": 0.0040874481201171875, "learning_rate": 3.115363310950578e-07, "loss": 0.0498, "reward": 0.3750000102445483, "reward_std": 0.40810926631093025, "rewards/accuracy_reward": 0.0729166716337204, "rewards/format_reward": 0.3020833386108279, "step": 72, "w_high_ratio": 0.033293891698122025, "w_low_ratio": 0.03450615704059601, "w_max": 1.6246004700660706, "w_mean": 1.1392557322978973, "w_min": 2.138569791073276e-36, "w_std": 0.19269496202468872 }, { "completion_length": 3837.1146240234375, "cov_mean": 2.1203804863034748e-06, "cov_std": 0.15915799140930176, "entropy": 0.51171875, "epoch": 0.08342857142857144, "grad_norm": 0.08118956536054611, "kl": 0.0014677047729492188, "learning_rate": 2.9836319343816397e-07, "loss": 0.0146, "reward": 0.2708333432674408, "reward_std": 0.25903886556625366, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.1666666716337204, "step": 73, "w_high_ratio": 0.0, "w_low_ratio": 0.020282023586332798, "w_max": 1.1690112948417664, "w_mean": 1.027027040719986, "w_min": 0.5, "w_std": 0.09119972214102745 }, { "completion_length": 3503.1250610351562, "cov_mean": -4.496445217228029e-05, "cov_std": 0.2548239603638649, "entropy": 0.41748046875, "epoch": 0.08457142857142858, "grad_norm": 0.1568731665611267, "kl": 0.00296783447265625, "learning_rate": 2.854966364683872e-07, "loss": 0.0655, "reward": 0.5625000074505806, "reward_std": 0.4641239196062088, "rewards/accuracy_reward": 0.2604166679084301, "rewards/format_reward": 0.3020833358168602, "step": 74, "w_high_ratio": 0.0, "w_low_ratio": 0.033767144195735455, "w_max": 1.4706333875656128, "w_mean": 1.078809916973114, "w_min": 0.25, "w_std": 0.1507711410522461 }, { "completion_length": 3346.1875610351562, "cov_mean": -7.328241736104246e-06, "cov_std": 0.20871411636471748, "entropy": 0.404296875, "epoch": 0.08571428571428572, "grad_norm": 0.10099554806947708, "kl": 0.0059261322021484375, "learning_rate": 2.729523361034538e-07, "loss": 0.0554, "reward": 0.572916679084301, "reward_std": 0.3865407630801201, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.3854166716337204, "step": 75, "w_high_ratio": 0.0, "w_low_ratio": 0.032186293974518776, "w_max": 1.5220047235488892, "w_mean": 1.1460089683532715, "w_min": 0.25, "w_std": 0.17850109934806824 }, { "completion_length": 3092.635498046875, "cov_mean": 4.942317445966182e-06, "cov_std": 0.20054961927235126, "entropy": 0.44091796875, "epoch": 0.08685714285714285, "grad_norm": 0.14435574412345886, "kl": 0.001827239990234375, "learning_rate": 2.6074557564105724e-07, "loss": 0.02, "reward": 0.583333358168602, "reward_std": 0.34913603961467743, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.4791666939854622, "step": 76, "w_high_ratio": 0.0, "w_low_ratio": 0.030099061783403158, "w_max": 1.574487328529358, "w_mean": 1.184053212404251, "w_min": 1.2891045632755887e-30, "w_std": 0.1306541245430708 }, { "completion_length": 3414.0938110351562, "cov_mean": 4.712834697784274e-05, "cov_std": 0.23044732213020325, "entropy": 0.45263671875, "epoch": 0.088, "grad_norm": 0.27278050780296326, "kl": 0.0013208389282226562, "learning_rate": 2.488912271385139e-07, "loss": 0.015, "reward": 0.510416672565043, "reward_std": 0.4236603006720543, "rewards/accuracy_reward": 0.1354166716337204, "rewards/format_reward": 0.37500001583248377, "step": 77, "w_high_ratio": 0.006981382612138987, "w_low_ratio": 0.02834776253439486, "w_max": 1.5653101801872253, "w_mean": 1.1680251359939575, "w_min": 5.385388925526598e-27, "w_std": 0.172462142072618 }, { "completion_length": 3528.5001220703125, "cov_mean": -2.2849541437608423e-05, "cov_std": 0.24372886680066586, "entropy": 0.41015625, "epoch": 0.08914285714285715, "grad_norm": 0.13359344005584717, "kl": 0.0022611618041992188, "learning_rate": 2.374037332934512e-07, "loss": 0.037, "reward": 0.6562500186264515, "reward_std": 0.5271749570965767, "rewards/accuracy_reward": 0.2812500111758709, "rewards/format_reward": 0.3750000111758709, "step": 78, "w_high_ratio": 0.041247133165597916, "w_low_ratio": 0.027018944965675473, "w_max": 1.6266585290431976, "w_mean": 1.1576823890209198, "w_min": 1.890902738208876e-34, "w_std": 0.16164034884423018 }, { "completion_length": 2676.187545776367, "cov_mean": 1.6489982044731732e-05, "cov_std": 0.21785889007151127, "entropy": 0.34912109375, "epoch": 0.09028571428571429, "grad_norm": 0.1319928616285324, "kl": 0.0022878646850585938, "learning_rate": 2.2629708984760706e-07, "loss": 0.0289, "reward": 0.843750037252903, "reward_std": 0.4268086552619934, "rewards/accuracy_reward": 0.26041667349636555, "rewards/format_reward": 0.5833333544433117, "step": 79, "w_high_ratio": 0.0, "w_low_ratio": 0.03145516477525234, "w_max": 1.4992458820343018, "w_mean": 1.168209046125412, "w_min": 0.0, "w_std": 0.1453277636319399 }, { "completion_length": 3573.8021240234375, "cov_mean": -9.463059541303664e-06, "cov_std": 0.1826024018228054, "entropy": 0.48974609375, "epoch": 0.09142857142857143, "grad_norm": 0.0996597409248352, "kl": 0.002468109130859375, "learning_rate": 2.1558482853517253e-07, "loss": 0.0271, "reward": 0.520833358168602, "reward_std": 0.3810138627886772, "rewards/accuracy_reward": 0.1979166679084301, "rewards/format_reward": 0.3229166753590107, "step": 80, "w_high_ratio": 0.0, "w_low_ratio": 0.02435835381038487, "w_max": 1.368900626897812, "w_mean": 1.106232464313507, "w_min": 0.25, "w_std": 0.11319147422909737 }, { "completion_length": 3308.625, "cov_mean": -1.9006092770723626e-05, "cov_std": 0.1842699982225895, "entropy": 0.58740234375, "epoch": 0.09257142857142857, "grad_norm": 0.15561415255069733, "kl": 0.005096435546875, "learning_rate": 2.0528000059645995e-07, "loss": 0.0328, "reward": 0.4062500149011612, "reward_std": 0.28905032202601433, "rewards/accuracy_reward": 0.11458333395421505, "rewards/format_reward": 0.291666679084301, "step": 81, "w_high_ratio": 0.125, "w_low_ratio": 0.02731443475931883, "w_max": 1.497319370508194, "w_mean": 1.1954041719436646, "w_min": 0.25, "w_std": 0.12777045369148254 }, { "completion_length": 3054.5938110351562, "cov_mean": -1.190575176224229e-05, "cov_std": 0.13467486761510372, "entropy": 0.453125, "epoch": 0.09371428571428571, "grad_norm": 0.06870616227388382, "kl": 0.0037078857421875, "learning_rate": 1.9539516087697517e-07, "loss": 0.0283, "reward": 0.6458333432674408, "reward_std": 0.2705298960208893, "rewards/accuracy_reward": 0.2187500074505806, "rewards/format_reward": 0.4270833432674408, "step": 82, "w_high_ratio": 0.0, "w_low_ratio": 0.016570631880313158, "w_max": 1.6605907380580902, "w_mean": 1.2332959175109863, "w_min": 0.25, "w_std": 0.10085548926144838 }, { "completion_length": 3103.7188110351562, "cov_mean": -1.1289954500171007e-05, "cov_std": 0.25786374136805534, "entropy": 0.4951171875, "epoch": 0.09485714285714286, "grad_norm": 0.11641041934490204, "kl": 0.0033721923828125, "learning_rate": 1.8594235253127372e-07, "loss": 0.0561, "reward": 0.541666679084301, "reward_std": 0.5305610671639442, "rewards/accuracy_reward": 0.17708333488553762, "rewards/format_reward": 0.3645833507180214, "step": 83, "w_high_ratio": 0.0, "w_low_ratio": 0.03598734503611922, "w_max": 1.3528369665145874, "w_mean": 1.095271646976471, "w_min": 0.0, "w_std": 0.1604925710707903 }, { "completion_length": 3278.104248046875, "cov_mean": 3.2929374356172048e-06, "cov_std": 0.246146522462368, "entropy": 0.45849609375, "epoch": 0.096, "grad_norm": 0.15934637188911438, "kl": 0.0013265609741210938, "learning_rate": 1.7693309235023127e-07, "loss": 0.0172, "reward": 0.7187500298023224, "reward_std": 0.432245634496212, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.4062500149011612, "step": 84, "w_high_ratio": 0.0, "w_low_ratio": 0.02858129981905222, "w_max": 1.45015150308609, "w_mean": 1.134983777999878, "w_min": 0.25, "w_std": 0.15726573020219803 }, { "completion_length": 3484.729248046875, "cov_mean": -8.481749773636693e-06, "cov_std": 0.29308537393808365, "entropy": 0.37841796875, "epoch": 0.09714285714285714, "grad_norm": 0.1379634290933609, "kl": 0.0017528533935546875, "learning_rate": 1.6837835672960831e-07, "loss": 0.0623, "reward": 0.5104166716337204, "reward_std": 0.6195737272500992, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.3437500037252903, "step": 85, "w_high_ratio": 0.0, "w_low_ratio": 0.03877481259405613, "w_max": 1.4117690026760101, "w_mean": 1.119108110666275, "w_min": 0.0, "w_std": 0.18827635422348976 }, { "completion_length": 3158.8125, "cov_mean": 6.843166598713424e-06, "cov_std": 0.12266075890511274, "entropy": 0.47119140625, "epoch": 0.09828571428571428, "grad_norm": 0.09657198935747147, "kl": 0.0029735565185546875, "learning_rate": 1.6028856829700258e-07, "loss": 0.0174, "reward": 0.541666679084301, "reward_std": 0.2581377625465393, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.4166666716337204, "step": 86, "w_high_ratio": 0.0, "w_low_ratio": 0.018843807047232985, "w_max": 1.3829069435596466, "w_mean": 1.131670981645584, "w_min": 0.25, "w_std": 0.11318285018205643 }, { "completion_length": 3175.2188720703125, "cov_mean": -5.853003926858946e-05, "cov_std": 0.30259813368320465, "entropy": 0.556640625, "epoch": 0.09942857142857142, "grad_norm": 0.38111400604248047, "kl": 0.007335662841796875, "learning_rate": 1.5267358321348285e-07, "loss": 0.0778, "reward": 0.7187500149011612, "reward_std": 0.5092682540416718, "rewards/accuracy_reward": 0.2291666753590107, "rewards/format_reward": 0.4895833432674408, "step": 87, "w_high_ratio": 0.11183382570743561, "w_low_ratio": 0.03777051903307438, "w_max": 1.9268704950809479, "w_mean": 1.334629088640213, "w_min": 5.605193857299268e-45, "w_std": 0.27019689977169037 }, { "completion_length": 3190.9375610351562, "cov_mean": 7.67477886256529e-05, "cov_std": 0.46318161487579346, "entropy": 0.5048828125, "epoch": 0.10057142857142858, "grad_norm": 0.2448384165763855, "kl": 0.020694732666015625, "learning_rate": 1.4554267916537495e-07, "loss": 0.0906, "reward": 0.7604167014360428, "reward_std": 0.7656450867652893, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.4687500149011612, "step": 88, "w_high_ratio": 0.08163053542375565, "w_low_ratio": 0.06242929771542549, "w_max": 1.8597923815250397, "w_mean": 1.2850928604602814, "w_min": 2.421886516239847e-28, "w_std": 0.3251073509454727 }, { "completion_length": 3597.2396850585938, "cov_mean": -4.437502229848178e-05, "cov_std": 0.29206302016973495, "entropy": 0.45703125, "epoch": 0.10171428571428572, "grad_norm": 0.16698718070983887, "kl": 0.00389862060546875, "learning_rate": 1.3890454406082956e-07, "loss": 0.0647, "reward": 0.4687500149011612, "reward_std": 0.5414880514144897, "rewards/accuracy_reward": 0.1770833395421505, "rewards/format_reward": 0.291666679084301, "step": 89, "w_high_ratio": 0.0, "w_low_ratio": 0.04101241147145629, "w_max": 1.4729963839054108, "w_mean": 1.0995305478572845, "w_min": 0.0, "w_std": 0.19675205275416374 }, { "completion_length": 2862.125, "cov_mean": -5.355579560273327e-06, "cov_std": 0.0925431028008461, "entropy": 0.638671875, "epoch": 0.10285714285714286, "grad_norm": 0.09346118569374084, "kl": 0.01725006103515625, "learning_rate": 1.3276726544494571e-07, "loss": 0.0269, "reward": 0.4791666716337204, "reward_std": 0.19776283204555511, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.4583333432674408, "step": 90, "w_high_ratio": 0.12039810419082642, "w_low_ratio": 0.014701983891427517, "w_max": 2.006953328847885, "w_mean": 1.3530822694301605, "w_min": 0.5, "w_std": 0.09125572815537453 }, { "completion_length": 3527.2813110351562, "cov_mean": -6.6539573708723765e-06, "cov_std": 0.19490721449255943, "entropy": 0.47802734375, "epoch": 0.104, "grad_norm": 0.16878993809223175, "kl": 0.0045948028564453125, "learning_rate": 1.2713832064634125e-07, "loss": 0.0023, "reward": 0.48958333395421505, "reward_std": 0.31570227444171906, "rewards/accuracy_reward": 0.17708333861082792, "rewards/format_reward": 0.3125000027939677, "step": 91, "w_high_ratio": 0.0432400144636631, "w_low_ratio": 0.025078749749809504, "w_max": 1.4252241849899292, "w_mean": 1.1511092782020569, "w_min": 0.25, "w_std": 0.14859570004045963 }, { "completion_length": 2996.8333740234375, "cov_mean": 5.000362762075383e-06, "cov_std": 0.274563018232584, "entropy": 0.44921875, "epoch": 0.10514285714285715, "grad_norm": 0.2672693729400635, "kl": 0.011915206909179688, "learning_rate": 1.220245676671809e-07, "loss": 0.0151, "reward": 0.6875000223517418, "reward_std": 0.3997742757201195, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.5312500074505806, "step": 92, "w_high_ratio": 0.057363301515579224, "w_low_ratio": 0.03872442920692265, "w_max": 1.9733782410621643, "w_mean": 1.2820636332035065, "w_min": 1.1237891646640876e-26, "w_std": 0.21614115312695503 }, { "completion_length": 3868.3333740234375, "cov_mean": 2.6024475801023073e-05, "cov_std": 0.11580366268754005, "entropy": 0.59130859375, "epoch": 0.10628571428571429, "grad_norm": 0.07178976386785507, "kl": 0.00457763671875, "learning_rate": 1.1743223682775649e-07, "loss": 0.0217, "reward": 0.10416666883975267, "reward_std": 0.23858631029725075, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.08333333488553762, "step": 93, "w_high_ratio": 0.0, "w_low_ratio": 0.022031503496691585, "w_max": 1.2298710346221924, "w_mean": 1.0156493484973907, "w_min": 9.954022423630139e-25, "w_std": 0.08076347131282091 }, { "completion_length": 3368.0834350585938, "cov_mean": -9.980105346585333e-07, "cov_std": 0.14382942207157612, "entropy": 0.56640625, "epoch": 0.10742857142857143, "grad_norm": 0.09354749321937561, "kl": 0.010030746459960938, "learning_rate": 1.1336692317580158e-07, "loss": 0.0167, "reward": 0.479166679084301, "reward_std": 0.26436545327305794, "rewards/accuracy_reward": 0.15625000651925802, "rewards/format_reward": 0.3229166679084301, "step": 94, "w_high_ratio": 0.0, "w_low_ratio": 0.020133810699917376, "w_max": 1.4812421798706055, "w_mean": 1.140279084444046, "w_min": 2.8643974210955766e-17, "w_std": 0.10274781100451946 }, { "completion_length": 3727.229248046875, "cov_mean": 2.8429121812223457e-07, "cov_std": 0.1946401260793209, "entropy": 0.4609375, "epoch": 0.10857142857142857, "grad_norm": 0.09702739864587784, "kl": 0.0018458366394042969, "learning_rate": 1.0983357966978745e-07, "loss": 0.039, "reward": 0.2916666707023978, "reward_std": 0.4342379942536354, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.21875000838190317, "step": 95, "w_high_ratio": 0.0, "w_low_ratio": 0.027697827550582588, "w_max": 1.2496315836906433, "w_mean": 1.0440161526203156, "w_min": 0.0, "w_std": 0.13094050344079733 }, { "completion_length": 3174.6876220703125, "cov_mean": 0.00011378643011994427, "cov_std": 0.20376956462860107, "entropy": 0.4482421875, "epoch": 0.10971428571428571, "grad_norm": 0.13061358034610748, "kl": 0.005021095275878906, "learning_rate": 1.068365111445064e-07, "loss": 0.007, "reward": 0.6562500074505806, "reward_std": 0.3372773453593254, "rewards/accuracy_reward": 0.2812500037252903, "rewards/format_reward": 0.3750000074505806, "step": 96, "w_high_ratio": 0.08483665436506271, "w_low_ratio": 0.023257225286215544, "w_max": 1.6405883729457855, "w_mean": 1.170585960149765, "w_min": 0.25, "w_std": 0.1635773852467537 }, { "completion_length": 3578.6458740234375, "cov_mean": 5.654522146869567e-05, "cov_std": 0.2582091810181737, "entropy": 0.4716796875, "epoch": 0.11085714285714286, "grad_norm": 0.23674072325229645, "kl": 0.0026683807373046875, "learning_rate": 1.0437936906629334e-07, "loss": 0.0282, "reward": 0.48958334419876337, "reward_std": 0.4029072895646095, "rewards/accuracy_reward": 0.21875000558793545, "rewards/format_reward": 0.27083334140479565, "step": 97, "w_high_ratio": 0.0427275113761425, "w_low_ratio": 0.02712295390665531, "w_max": 1.5456224977970123, "w_mean": 1.134882390499115, "w_min": 8.233329127140463e-42, "w_std": 0.17803996708244085 }, { "completion_length": 3275.70849609375, "cov_mean": -2.5506165911792777e-05, "cov_std": 0.22730276361107826, "entropy": 0.43359375, "epoch": 0.112, "grad_norm": 0.154100239276886, "kl": 0.0016241073608398438, "learning_rate": 1.0246514708427701e-07, "loss": 0.0791, "reward": 0.5208333432674408, "reward_std": 0.45262154936790466, "rewards/accuracy_reward": 0.14583333674818277, "rewards/format_reward": 0.3750000149011612, "step": 98, "w_high_ratio": 0.031526632606983185, "w_low_ratio": 0.030860408674925566, "w_max": 1.7042989134788513, "w_mean": 1.219407707452774, "w_min": 1.0509738482436128e-44, "w_std": 0.1900232806801796 }, { "completion_length": 3084.593795776367, "cov_mean": -6.495133902717498e-06, "cov_std": 0.13577165454626083, "entropy": 0.386474609375, "epoch": 0.11314285714285714, "grad_norm": 0.07463299483060837, "kl": 0.008008956909179688, "learning_rate": 1.0109617738307911e-07, "loss": 0.0185, "reward": 0.5208333488553762, "reward_std": 0.2819661721587181, "rewards/accuracy_reward": 0.19791667815297842, "rewards/format_reward": 0.3229166669771075, "step": 99, "w_high_ratio": 0.04817802831530571, "w_low_ratio": 0.017340978607535362, "w_max": 1.389538049697876, "w_mean": 1.1735607981681824, "w_min": 0.25, "w_std": 0.1109000938013196 }, { "completion_length": 3224.5834350585938, "cov_mean": -4.5620060973305954e-05, "cov_std": 0.26372817903757095, "entropy": 0.42724609375, "epoch": 0.11428571428571428, "grad_norm": 0.13250450789928436, "kl": 0.009317398071289062, "learning_rate": 1.002741278414069e-07, "loss": 0.0352, "reward": 0.7083333656191826, "reward_std": 0.4755344055593014, "rewards/accuracy_reward": 0.260416679084301, "rewards/format_reward": 0.447916679084301, "step": 100, "w_high_ratio": 0.05065765231847763, "w_low_ratio": 0.03619965072721243, "w_max": 1.6908635199069977, "w_mean": 1.1874340772628784, "w_min": 1.2465886678904214e-38, "w_std": 0.2075340449810028 }, { "epoch": 0.11428571428571428, "step": 100, "total_flos": 0.0, "train_loss": 0.041623960277065636, "train_runtime": 8415.8875, "train_samples_per_second": 1.141, "train_steps_per_second": 0.012 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }