{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 330, "global_step": 1650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.0006062443164595332, "grad_norm": 1.5400323867797852, "kl": 5.352497100830078e-05, "learning_rate": 0.0, "loss": 0.0, "reward": 0.0024800303508527577, "reward_std": 0.10826849192380905, "rewards/ndcg_rule_reward": -0.022910594940185547, "rewards/rule_reward": 0.025390625, "step": 1, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.0012124886329190664, "grad_norm": 1.5513525009155273, "kl": 0.00011777877807617188, "learning_rate": 1.0101010101010103e-07, "loss": 0.0, "reward": 0.0048056356608867645, "reward_std": 0.14086522161960602, "rewards/ndcg_rule_reward": -0.028397489339113235, "rewards/rule_reward": 0.033203125, "step": 2, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.0018187329493785996, "grad_norm": 2.303806781768799, "kl": 0.0004100799560546875, "learning_rate": 2.0202020202020205e-07, "loss": 0.0, "reward": 0.0038862600922584534, "reward_std": 0.1159985214471817, "rewards/ndcg_rule_reward": -0.02345749083906412, "rewards/rule_reward": 0.02734375, "step": 3, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.002424977265838133, "grad_norm": 1.1126917600631714, "kl": 0.0002827644348144531, "learning_rate": 3.0303030303030305e-07, "loss": 0.0, "reward": 0.0036368253640830517, "reward_std": 0.1498279646039009, "rewards/ndcg_rule_reward": -0.031519425101578236, "rewards/rule_reward": 0.03515625, "step": 4, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.003031221582297666, "grad_norm": 1.480296015739441, "kl": 0.00022792816162109375, "learning_rate": 4.040404040404041e-07, "loss": 0.0, "reward": 0.005282615777105093, "reward_std": 0.14898599684238434, "rewards/ndcg_rule_reward": -0.02987363375723362, "rewards/rule_reward": 0.03515625, "step": 5, "token_diversity": 0.52990625 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.003637465898757199, "grad_norm": 1.0779246091842651, "kl": 0.00015020370483398438, "learning_rate": 5.05050505050505e-07, "loss": 0.0, "reward": 0.002351373084820807, "reward_std": 0.08307912200689316, "rewards/ndcg_rule_reward": -0.017179876565933228, "rewards/rule_reward": 0.01953125, "step": 6, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 4.9765625, "epoch": 0.004243710215216732, "grad_norm": 1.721088171005249, "kl": 0.0001647472381591797, "learning_rate": 6.060606060606061e-07, "loss": 0.0, "reward": 0.004618021659553051, "reward_std": 0.14092640578746796, "rewards/ndcg_rule_reward": -0.02858510334044695, "rewards/rule_reward": 0.033203125, "step": 7, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.004849954531676266, "grad_norm": 1.1469007730484009, "kl": 0.0001392364501953125, "learning_rate": 7.070707070707071e-07, "loss": 0.0, "reward": 0.002999443095177412, "reward_std": 0.09961133450269699, "rewards/ndcg_rule_reward": -0.02043805830180645, "rewards/rule_reward": 0.0234375, "step": 8, "token_diversity": 0.50190625 }, { "categorical_diversity": 1.0, "completion_length": 5.009765625, "epoch": 0.005456198848135798, "grad_norm": 1.1377488374710083, "kl": 0.00029850006103515625, "learning_rate": 8.080808080808082e-07, "loss": 0.0, "reward": 0.0026342757046222687, "reward_std": 0.09135720506310463, "rewards/ndcg_rule_reward": -0.01885009976103902, "rewards/rule_reward": 0.021484375, "step": 9, "token_diversity": 0.58203125 }, { "categorical_diversity": 1.0, "completion_length": 4.970703125, "epoch": 0.006062443164595332, "grad_norm": 2.147977828979492, "kl": 0.000713348388671875, "learning_rate": 9.090909090909091e-07, "loss": 0.0, "reward": 0.00528041273355484, "reward_std": 0.15745288133621216, "rewards/ndcg_rule_reward": -0.03182896226644516, "rewards/rule_reward": 0.037109375, "step": 10, "token_diversity": 0.517625 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.006668687481054865, "grad_norm": 1.459490418434143, "kl": 0.001140594482421875, "learning_rate": 1.01010101010101e-06, "loss": 0.0, "reward": 0.004008835647255182, "reward_std": 0.13277767598628998, "rewards/ndcg_rule_reward": -0.02724116388708353, "rewards/rule_reward": 0.03125, "step": 11, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.083984375, "epoch": 0.007274931797514398, "grad_norm": 1.2936521768569946, "kl": 0.00131988525390625, "learning_rate": 1.111111111111111e-06, "loss": 0.0, "reward": 0.0027951786760240793, "reward_std": 0.12492906674742699, "rewards/ndcg_rule_reward": -0.026501696556806564, "rewards/rule_reward": 0.029296875, "step": 12, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.007881176113973931, "grad_norm": 1.238607406616211, "kl": 0.0015749931335449219, "learning_rate": 1.2121212121212122e-06, "loss": 0.0, "reward": 0.002825871401000768, "reward_std": 0.10809551179409027, "rewards/ndcg_rule_reward": -0.022564752958714962, "rewards/rule_reward": 0.025390625, "step": 13, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 4.96484375, "epoch": 0.008487420430433464, "grad_norm": 2.6915411949157715, "kl": 0.0065460205078125, "learning_rate": 1.3131313131313134e-06, "loss": 0.0, "reward": 0.003642322728410363, "reward_std": 0.10770586133003235, "rewards/ndcg_rule_reward": -0.02174830250442028, "rewards/rule_reward": 0.025390625, "step": 14, "token_diversity": 0.48209375 }, { "categorical_diversity": 1.0, "completion_length": 4.9765625, "epoch": 0.009093664746892998, "grad_norm": 2.5462663173675537, "kl": 0.02691650390625, "learning_rate": 1.4141414141414143e-06, "loss": 0.0, "reward": 0.0043973877327516675, "reward_std": 0.14103276282548904, "rewards/ndcg_rule_reward": -0.028805739246308804, "rewards/rule_reward": 0.033203125, "step": 15, "token_diversity": 0.54534375 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.009699909063352531, "grad_norm": 2.1059048175811768, "kl": 0.101318359375, "learning_rate": 1.5151515151515152e-06, "loss": 0.0001, "reward": 0.004753627348691225, "reward_std": 0.14928580820560455, "rewards/ndcg_rule_reward": -0.030402623116970062, "rewards/rule_reward": 0.03515625, "step": 16, "token_diversity": 0.5294375 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.010306153379812064, "grad_norm": 10.17150592803955, "kl": 0.08991527557373047, "learning_rate": 1.6161616161616164e-06, "loss": 0.0001, "reward": 0.004180497257038951, "reward_std": 0.14109767228364944, "rewards/ndcg_rule_reward": -0.029022627510130405, "rewards/rule_reward": 0.033203125, "step": 17, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.010912397696271597, "grad_norm": 1.1146018505096436, "kl": 0.00701904296875, "learning_rate": 1.7171717171717173e-06, "loss": 0.0, "reward": 0.002153917623218149, "reward_std": 0.0915323905646801, "rewards/ndcg_rule_reward": -0.019330458249896765, "rewards/rule_reward": 0.021484375, "step": 18, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.011518642012731131, "grad_norm": 1.5036029815673828, "kl": 0.2171630859375, "learning_rate": 1.8181818181818183e-06, "loss": 0.0002, "reward": 0.003524256870150566, "reward_std": 0.09934735298156738, "rewards/ndcg_rule_reward": -0.019913243129849434, "rewards/rule_reward": 0.0234375, "step": 19, "token_diversity": 0.50153125 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.012124886329190664, "grad_norm": 1.6304129362106323, "kl": 3.22686767578125, "learning_rate": 1.9191919191919192e-06, "loss": 0.0032, "reward": 0.004260817542672157, "reward_std": 0.12424533814191818, "rewards/ndcg_rule_reward": -0.025036057457327843, "rewards/rule_reward": 0.029296875, "step": 20, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.012731130645650197, "grad_norm": 1.4796206951141357, "kl": 3.141162872314453, "learning_rate": 2.02020202020202e-06, "loss": 0.0032, "reward": 0.0031266604783013463, "reward_std": 0.11636200174689293, "rewards/ndcg_rule_reward": -0.024217089638113976, "rewards/rule_reward": 0.02734375, "step": 21, "token_diversity": 0.4349375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.01333737496210973, "grad_norm": 0.870411217212677, "kl": 0.001003265380859375, "learning_rate": 2.1212121212121216e-06, "loss": 0.0, "reward": 0.002960484125651419, "reward_std": 0.11645891144871712, "rewards/ndcg_rule_reward": -0.024383265525102615, "rewards/rule_reward": 0.02734375, "step": 22, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.013943619278569264, "grad_norm": 1.2558519840240479, "kl": 3.0703125, "learning_rate": 2.222222222222222e-06, "loss": 0.0031, "reward": 0.0026481845416128635, "reward_std": 0.07449174299836159, "rewards/ndcg_rule_reward": -0.014929940458387136, "rewards/rule_reward": 0.017578125, "step": 23, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.014549863595028797, "grad_norm": 2.1293444633483887, "kl": 5.640625, "learning_rate": 2.3232323232323234e-06, "loss": 0.0056, "reward": 0.00443076656665653, "reward_std": 0.14099161326885223, "rewards/ndcg_rule_reward": -0.02877235785126686, "rewards/rule_reward": 0.033203125, "step": 24, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.01515610791148833, "grad_norm": 75.62908935546875, "kl": 100.51953125, "learning_rate": 2.4242424242424244e-06, "loss": 0.1008, "reward": 0.004420982673764229, "reward_std": 0.1409856602549553, "rewards/ndcg_rule_reward": -0.02878214232623577, "rewards/rule_reward": 0.033203125, "step": 25, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.015762352227947862, "grad_norm": 46.764888763427734, "kl": 99.75, "learning_rate": 2.5252525252525258e-06, "loss": 0.0995, "reward": 0.002811357262544334, "reward_std": 0.08284028433263302, "rewards/ndcg_rule_reward": -0.0167198923882097, "rewards/rule_reward": 0.01953125, "step": 26, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.016368596544407397, "grad_norm": 1.0488685369491577, "kl": 0.4440460205078125, "learning_rate": 2.6262626262626267e-06, "loss": 0.0004, "reward": 0.0028164212126284838, "reward_std": 0.09965949133038521, "rewards/ndcg_rule_reward": -0.02062107902020216, "rewards/rule_reward": 0.0234375, "step": 27, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.016974840860866928, "grad_norm": 8.555087089538574, "kl": 11.9571533203125, "learning_rate": 2.7272727272727272e-06, "loss": 0.012, "reward": 0.0014051207690499723, "reward_std": 0.05826898664236069, "rewards/ndcg_rule_reward": -0.012266754172742367, "rewards/rule_reward": 0.013671875, "step": 28, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.017581085177326462, "grad_norm": 24.12587547302246, "kl": 52.875, "learning_rate": 2.8282828282828286e-06, "loss": 0.0528, "reward": 0.0033741958905011415, "reward_std": 0.1246550977230072, "rewards/ndcg_rule_reward": -0.025922680273652077, "rewards/rule_reward": 0.029296875, "step": 29, "token_diversity": 0.55078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.018187329493785997, "grad_norm": 1.3126888275146484, "kl": 0.685546875, "learning_rate": 2.9292929292929295e-06, "loss": 0.0007, "reward": 0.0028985905228182673, "reward_std": 0.10806796327233315, "rewards/ndcg_rule_reward": -0.02249203436076641, "rewards/rule_reward": 0.025390625, "step": 30, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.018793573810245528, "grad_norm": 23.88640594482422, "kl": 47.2197265625, "learning_rate": 3.0303030303030305e-06, "loss": 0.0472, "reward": 0.003151086624711752, "reward_std": 0.09952313080430031, "rewards/ndcg_rule_reward": -0.020286413840949535, "rewards/rule_reward": 0.0234375, "step": 31, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.019399818126705062, "grad_norm": 8.879667282104492, "kl": 9.59375, "learning_rate": 3.131313131313132e-06, "loss": 0.0096, "reward": 0.0032851401483640075, "reward_std": 0.0994560644030571, "rewards/ndcg_rule_reward": -0.020152359269559383, "rewards/rule_reward": 0.0234375, "step": 32, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.020006062443164597, "grad_norm": 1.370516300201416, "kl": 0.186279296875, "learning_rate": 3.232323232323233e-06, "loss": 0.0002, "reward": 0.004284637980163097, "reward_std": 0.12423883751034737, "rewards/ndcg_rule_reward": -0.025012237951159477, "rewards/rule_reward": 0.029296875, "step": 33, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.020612306759624128, "grad_norm": 12.281739234924316, "kl": 43.84375, "learning_rate": 3.3333333333333333e-06, "loss": 0.0437, "reward": 0.003274939488619566, "reward_std": 0.10785181075334549, "rewards/ndcg_rule_reward": -0.02211568597704172, "rewards/rule_reward": 0.025390625, "step": 34, "token_diversity": 0.55078125 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.021218551076083662, "grad_norm": 1.174424171447754, "kl": 2.671875, "learning_rate": 3.4343434343434347e-06, "loss": 0.0027, "reward": 0.0042901840060949326, "reward_std": 0.14109846204519272, "rewards/ndcg_rule_reward": -0.028912940993905067, "rewards/rule_reward": 0.033203125, "step": 35, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 4.970703125, "epoch": 0.021824795392543193, "grad_norm": 11.309239387512207, "kl": 23.9375, "learning_rate": 3.5353535353535356e-06, "loss": 0.0239, "reward": 0.0038860075874254107, "reward_std": 0.11602924391627312, "rewards/ndcg_rule_reward": -0.023457743227481842, "rewards/rule_reward": 0.02734375, "step": 36, "token_diversity": 0.44340625 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.022431039709002728, "grad_norm": 1.1431318521499634, "kl": 1.7421875, "learning_rate": 3.6363636363636366e-06, "loss": 0.0017, "reward": 0.0024656091118231416, "reward_std": 0.0914391502737999, "rewards/ndcg_rule_reward": -0.019018765538930893, "rewards/rule_reward": 0.021484375, "step": 37, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.023037284025462262, "grad_norm": 1.9237505197525024, "kl": 10.390625, "learning_rate": 3.737373737373738e-06, "loss": 0.0104, "reward": 0.004410707042552531, "reward_std": 0.12416744232177734, "rewards/ndcg_rule_reward": -0.02488616853952408, "rewards/rule_reward": 0.029296875, "step": 38, "token_diversity": 0.50953125 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.023643528341921793, "grad_norm": 1.5106306076049805, "kl": 4.6328125, "learning_rate": 3.8383838383838385e-06, "loss": 0.0046, "reward": 0.0036176673602312803, "reward_std": 0.13293051719665527, "rewards/ndcg_rule_reward": -0.027632332406938076, "rewards/rule_reward": 0.03125, "step": 39, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 4.9765625, "epoch": 0.024249772658381328, "grad_norm": 20.677881240844727, "kl": 76.1875, "learning_rate": 3.93939393939394e-06, "loss": 0.0762, "reward": 0.0028566549299284816, "reward_std": 0.09126190841197968, "rewards/ndcg_rule_reward": -0.01862772088497877, "rewards/rule_reward": 0.021484375, "step": 40, "token_diversity": 0.47371874999999997 }, { "categorical_diversity": 1.0, "completion_length": 4.9765625, "epoch": 0.024856016974840862, "grad_norm": 1.6925115585327148, "kl": 12.65625, "learning_rate": 4.04040404040404e-06, "loss": 0.0126, "reward": 0.0033038717228919268, "reward_std": 0.11627256125211716, "rewards/ndcg_rule_reward": -0.02403987944126129, "rewards/rule_reward": 0.02734375, "step": 41, "token_diversity": 0.49753125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.025462261291300393, "grad_norm": 0.7900296449661255, "kl": 0.2724609375, "learning_rate": 4.141414141414142e-06, "loss": 0.0003, "reward": 0.002574550628196448, "reward_std": 0.08297721575945616, "rewards/ndcg_rule_reward": -0.016956698498688638, "rewards/rule_reward": 0.01953125, "step": 42, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.026068505607759928, "grad_norm": 1.1741803884506226, "kl": 3.125, "learning_rate": 4.242424242424243e-06, "loss": 0.0031, "reward": 0.003448347095400095, "reward_std": 0.12462522462010384, "rewards/ndcg_rule_reward": -0.02584853023290634, "rewards/rule_reward": 0.029296875, "step": 43, "token_diversity": 0.44675 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.02667474992421946, "grad_norm": 2.276197910308838, "kl": 9.6875, "learning_rate": 4.343434343434344e-06, "loss": 0.0097, "reward": 0.003224264190066606, "reward_std": 0.1078641340136528, "rewards/ndcg_rule_reward": -0.0221663611009717, "rewards/rule_reward": 0.025390625, "step": 44, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.027280994240678993, "grad_norm": 1.6216121912002563, "kl": 7.431640625, "learning_rate": 4.444444444444444e-06, "loss": 0.0074, "reward": 0.0021122837206348777, "reward_std": 0.06632601097226143, "rewards/ndcg_rule_reward": -0.013512716628611088, "rewards/rule_reward": 0.015625, "step": 45, "token_diversity": 0.521625 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.027887238557138528, "grad_norm": 1.133737325668335, "kl": 1.259765625, "learning_rate": 4.5454545454545455e-06, "loss": 0.0013, "reward": 0.0036106049083173275, "reward_std": 0.12460819631814957, "rewards/ndcg_rule_reward": -0.02568627055734396, "rewards/rule_reward": 0.029296875, "step": 46, "token_diversity": 0.53515625 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.02849348287359806, "grad_norm": 1.138932228088379, "kl": 4.8359375, "learning_rate": 4.646464646464647e-06, "loss": 0.0048, "reward": 0.002744482713751495, "reward_std": 0.09971462562680244, "rewards/ndcg_rule_reward": -0.020693017169833183, "rewards/rule_reward": 0.0234375, "step": 47, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 4.9765625, "epoch": 0.029099727190057594, "grad_norm": 12.40895938873291, "kl": 9.203125, "learning_rate": 4.747474747474748e-06, "loss": 0.0092, "reward": 0.003057865076698363, "reward_std": 0.10798261314630508, "rewards/ndcg_rule_reward": -0.022332760505378246, "rewards/rule_reward": 0.025390625, "step": 48, "token_diversity": 0.474 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.029705971506517128, "grad_norm": 1.979928970336914, "kl": 11.78125, "learning_rate": 4.848484848484849e-06, "loss": 0.0118, "reward": 0.004744670819491148, "reward_std": 0.14085173606872559, "rewards/ndcg_rule_reward": -0.028458453714847565, "rewards/rule_reward": 0.033203125, "step": 49, "token_diversity": 0.40331249999999996 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.03031221582297666, "grad_norm": 1.193053960800171, "kl": 1.3740234375, "learning_rate": 4.94949494949495e-06, "loss": 0.0014, "reward": 0.002771352999843657, "reward_std": 0.10811340063810349, "rewards/ndcg_rule_reward": -0.022619271650910378, "rewards/rule_reward": 0.025390625, "step": 50, "token_diversity": 0.54296875 }, { "categorical_diversity": 1.0, "completion_length": 4.96484375, "epoch": 0.030918460139436194, "grad_norm": 19.968990325927734, "kl": 76.0, "learning_rate": 5.0505050505050515e-06, "loss": 0.076, "reward": 0.0045046822633594275, "reward_std": 0.14939182996749878, "rewards/ndcg_rule_reward": -0.030651569366455078, "rewards/rule_reward": 0.03515625, "step": 51, "token_diversity": 0.52 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.031524704455895725, "grad_norm": 3.1417083740234375, "kl": 14.509765625, "learning_rate": 5.151515151515152e-06, "loss": 0.0145, "reward": 0.003782412735745311, "reward_std": 0.13289064168930054, "rewards/ndcg_rule_reward": -0.027467587031424046, "rewards/rule_reward": 0.03125, "step": 52, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.03213094877235526, "grad_norm": 14.938787460327148, "kl": 34.984375, "learning_rate": 5.252525252525253e-06, "loss": 0.0349, "reward": 0.0031755855306982994, "reward_std": 0.09948765859007835, "rewards/ndcg_rule_reward": -0.0202619144693017, "rewards/rule_reward": 0.0234375, "step": 53, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.032737193088814794, "grad_norm": 105.96237182617188, "kl": 200.8125, "learning_rate": 5.353535353535354e-06, "loss": 0.2009, "reward": 0.00427441275678575, "reward_std": 0.14948837459087372, "rewards/ndcg_rule_reward": -0.030881837010383606, "rewards/rule_reward": 0.03515625, "step": 54, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 4.9765625, "epoch": 0.033343437405274325, "grad_norm": 1.8183625936508179, "kl": 6.421875, "learning_rate": 5.4545454545454545e-06, "loss": 0.0064, "reward": 0.0034508100943639874, "reward_std": 0.10780049860477448, "rewards/ndcg_rule_reward": -0.021939815022051334, "rewards/rule_reward": 0.025390625, "step": 55, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 4.96484375, "epoch": 0.033949681721733856, "grad_norm": 1.900486707687378, "kl": 5.96875, "learning_rate": 5.555555555555557e-06, "loss": 0.006, "reward": 0.003961636568419635, "reward_std": 0.12437005341053009, "rewards/ndcg_rule_reward": -0.025335238315165043, "rewards/rule_reward": 0.029296875, "step": 56, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.034555926038193394, "grad_norm": 1.1981632709503174, "kl": 3.3203125, "learning_rate": 5.656565656565657e-06, "loss": 0.0033, "reward": 0.0037334166700020432, "reward_std": 0.10764847695827484, "rewards/ndcg_rule_reward": -0.021657208912074566, "rewards/rule_reward": 0.025390625, "step": 57, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.035162170354652925, "grad_norm": 1.5952144861221313, "kl": 7.125, "learning_rate": 5.7575757575757586e-06, "loss": 0.0071, "reward": 0.0022592057939618826, "reward_std": 0.07470864057540894, "rewards/ndcg_rule_reward": -0.015318918973207474, "rewards/rule_reward": 0.017578125, "step": 58, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.035768414671112456, "grad_norm": 0.9707615375518799, "kl": 0.5205078125, "learning_rate": 5.858585858585859e-06, "loss": 0.0005, "reward": 0.0030716261826455593, "reward_std": 0.10795027762651443, "rewards/ndcg_rule_reward": -0.022319000214338303, "rewards/rule_reward": 0.025390625, "step": 59, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.036374658987571994, "grad_norm": 2.5114564895629883, "kl": 13.984375, "learning_rate": 5.95959595959596e-06, "loss": 0.014, "reward": 0.0032994369976222515, "reward_std": 0.10787747800350189, "rewards/ndcg_rule_reward": -0.022091188468039036, "rewards/rule_reward": 0.025390625, "step": 60, "token_diversity": 0.53515625 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.036980903304031525, "grad_norm": 10.236863136291504, "kl": 42.3125, "learning_rate": 6.060606060606061e-06, "loss": 0.0423, "reward": 0.0038316146237775683, "reward_std": 0.1328699216246605, "rewards/ndcg_rule_reward": -0.027418386191129684, "rewards/rule_reward": 0.03125, "step": 61, "token_diversity": 0.5021875 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.037587147620491056, "grad_norm": 6.111833095550537, "kl": 23.5625, "learning_rate": 6.1616161616161615e-06, "loss": 0.0236, "reward": 0.003233205759897828, "reward_std": 0.10790019482374191, "rewards/ndcg_rule_reward": -0.022157419472932816, "rewards/rule_reward": 0.025390625, "step": 62, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.038193391936950594, "grad_norm": 2.6106460094451904, "kl": 13.21875, "learning_rate": 6.262626262626264e-06, "loss": 0.0133, "reward": 0.004641757346689701, "reward_std": 0.15771730244159698, "rewards/ndcg_rule_reward": -0.032467618584632874, "rewards/rule_reward": 0.037109375, "step": 63, "token_diversity": 0.42321875 }, { "categorical_diversity": 1.0, "completion_length": 4.96484375, "epoch": 0.038799636253410125, "grad_norm": 3.9326419830322266, "kl": 9.59375, "learning_rate": 6.363636363636364e-06, "loss": 0.0096, "reward": 0.0039002780104056, "reward_std": 0.11599454283714294, "rewards/ndcg_rule_reward": -0.023443471640348434, "rewards/rule_reward": 0.02734375, "step": 64, "token_diversity": 0.492 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.039405880569869656, "grad_norm": 1.1352834701538086, "kl": 8.65625, "learning_rate": 6.464646464646466e-06, "loss": 0.0087, "reward": 0.002973293769173324, "reward_std": 0.09958279877901077, "rewards/ndcg_rule_reward": -0.02046420518308878, "rewards/rule_reward": 0.0234375, "step": 65, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 4.958984375, "epoch": 0.040012124886329194, "grad_norm": 30.724159240722656, "kl": 4.609375, "learning_rate": 6.565656565656566e-06, "loss": 0.0046, "reward": 0.003767924150452018, "reward_std": 0.12446090579032898, "rewards/ndcg_rule_reward": -0.02552895061671734, "rewards/rule_reward": 0.029296875, "step": 66, "token_diversity": 0.41521874999999997 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.040618369202788725, "grad_norm": 1.2954877614974976, "kl": 5.2109375, "learning_rate": 6.666666666666667e-06, "loss": 0.0052, "reward": 0.002440905023831874, "reward_std": 0.07462366297841072, "rewards/ndcg_rule_reward": -0.015137220732867718, "rewards/rule_reward": 0.017578125, "step": 67, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.041224613519248256, "grad_norm": 1.2979233264923096, "kl": 1.30078125, "learning_rate": 6.767676767676769e-06, "loss": 0.0013, "reward": 0.0030781251844018698, "reward_std": 0.1163739487528801, "rewards/ndcg_rule_reward": -0.024265624582767487, "rewards/rule_reward": 0.02734375, "step": 68, "token_diversity": 0.5390625 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.041830857835707794, "grad_norm": 3.2234599590301514, "kl": 14.703125, "learning_rate": 6.868686868686869e-06, "loss": 0.0147, "reward": 0.003410794190131128, "reward_std": 0.11620849370956421, "rewards/ndcg_rule_reward": -0.023932956159114838, "rewards/rule_reward": 0.02734375, "step": 69, "token_diversity": 0.521625 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.042437102152167325, "grad_norm": 3.1736481189727783, "kl": 19.33203125, "learning_rate": 6.969696969696971e-06, "loss": 0.0194, "reward": 0.0028618655633181334, "reward_std": 0.0912199430167675, "rewards/ndcg_rule_reward": -0.018622509203851223, "rewards/rule_reward": 0.021484375, "step": 70, "token_diversity": 0.49409375 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.043043346468626856, "grad_norm": 1.045681118965149, "kl": 0.896484375, "learning_rate": 7.070707070707071e-06, "loss": 0.0009, "reward": 0.0018174280994571745, "reward_std": 0.08330833166837692, "rewards/ndcg_rule_reward": -0.017713822424411774, "rewards/rule_reward": 0.01953125, "step": 71, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 4.96484375, "epoch": 0.04364959078508639, "grad_norm": 1.6554479598999023, "kl": 3.953125, "learning_rate": 7.171717171717172e-06, "loss": 0.004, "reward": 0.003971070982515812, "reward_std": 0.12436038255691528, "rewards/ndcg_rule_reward": -0.025325804017484188, "rewards/rule_reward": 0.029296875, "step": 72, "token_diversity": 0.43390625 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.044255835101545925, "grad_norm": 1.2115213871002197, "kl": 3.1875, "learning_rate": 7.272727272727273e-06, "loss": 0.0032, "reward": 0.0029134098440408707, "reward_std": 0.11646150052547455, "rewards/ndcg_rule_reward": -0.02443034015595913, "rewards/rule_reward": 0.02734375, "step": 73, "token_diversity": 0.396 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.044862079418005456, "grad_norm": 2.2746400833129883, "kl": 8.017578125, "learning_rate": 7.373737373737374e-06, "loss": 0.008, "reward": 0.0021807224838994443, "reward_std": 0.07474218867719173, "rewards/ndcg_rule_reward": -0.015397402923554182, "rewards/rule_reward": 0.017578125, "step": 74, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 4.982421875, "epoch": 0.04546832373446499, "grad_norm": 3.80625581741333, "kl": 20.77734375, "learning_rate": 7.474747474747476e-06, "loss": 0.0208, "reward": 0.0033867552410811186, "reward_std": 0.10782413929700851, "rewards/ndcg_rule_reward": -0.022003870457410812, "rewards/rule_reward": 0.025390625, "step": 75, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.046074568050924525, "grad_norm": 10.783246040344238, "kl": 43.78125, "learning_rate": 7.5757575757575764e-06, "loss": 0.0438, "reward": 0.0027930361684411764, "reward_std": 0.09127305075526237, "rewards/ndcg_rule_reward": -0.018691339530050755, "rewards/rule_reward": 0.021484375, "step": 76, "token_diversity": 0.47371874999999997 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.046680812367384056, "grad_norm": 6.680273056030273, "kl": 27.65625, "learning_rate": 7.676767676767677e-06, "loss": 0.0276, "reward": 0.002550451667048037, "reward_std": 0.09981150180101395, "rewards/ndcg_rule_reward": -0.020887047983705997, "rewards/rule_reward": 0.0234375, "step": 77, "token_diversity": 0.44665625 }, { "categorical_diversity": 1.0, "completion_length": 5.05859375, "epoch": 0.04728705668384359, "grad_norm": 1.4827557802200317, "kl": 6.671875, "learning_rate": 7.77777777777778e-06, "loss": 0.0067, "reward": 0.004080690094269812, "reward_std": 0.14958716928958893, "rewards/ndcg_rule_reward": -0.031075559556484222, "rewards/rule_reward": 0.03515625, "step": 78, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 4.970703125, "epoch": 0.047893301000303125, "grad_norm": 1.9856704473495483, "kl": 6.28125, "learning_rate": 7.87878787878788e-06, "loss": 0.0063, "reward": 0.004723605467006564, "reward_std": 0.14929447323083878, "rewards/ndcg_rule_reward": -0.03043264616280794, "rewards/rule_reward": 0.03515625, "step": 79, "token_diversity": 0.42571875000000003 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.048499545316762656, "grad_norm": 2.3475756645202637, "kl": 14.1875, "learning_rate": 7.97979797979798e-06, "loss": 0.0142, "reward": 0.0027173429261893034, "reward_std": 0.10812755674123764, "rewards/ndcg_rule_reward": -0.02267328090965748, "rewards/rule_reward": 0.025390625, "step": 80, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.361328125, "epoch": 0.04910578963322219, "grad_norm": 1.7982308864593506, "kl": 2.296875, "learning_rate": 8.08080808080808e-06, "loss": 0.0023, "reward": 0.002920576254837215, "reward_std": 0.10805945098400116, "rewards/ndcg_rule_reward": -0.022470048628747463, "rewards/rule_reward": 0.025390625, "step": 81, "token_diversity": 0.48228125 }, { "categorical_diversity": 1.0, "completion_length": 5.560546875, "epoch": 0.049712033949681725, "grad_norm": 2.6818346977233887, "kl": 19.3125, "learning_rate": 8.181818181818183e-06, "loss": 0.0193, "reward": 0.0020137044484727085, "reward_std": 0.07478652149438858, "rewards/ndcg_rule_reward": -0.015564420726150274, "rewards/rule_reward": 0.017578125, "step": 82, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.029296875, "epoch": 0.050318278266141256, "grad_norm": 29.689342498779297, "kl": 130.5, "learning_rate": 8.282828282828283e-06, "loss": 0.1311, "reward": 0.004399397876113653, "reward_std": 0.14099986106157303, "rewards/ndcg_rule_reward": -0.028803727589547634, "rewards/rule_reward": 0.033203125, "step": 83, "token_diversity": 0.4701875 }, { "categorical_diversity": 1.0, "completion_length": 5.962890625, "epoch": 0.05092452258260079, "grad_norm": 3.5635581016540527, "kl": 24.28125, "learning_rate": 8.383838383838384e-06, "loss": 0.0243, "reward": 0.003691555466502905, "reward_std": 0.10768803209066391, "rewards/ndcg_rule_reward": -0.021699069999158382, "rewards/rule_reward": 0.025390625, "step": 84, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0234375, "epoch": 0.05153076689906032, "grad_norm": 1.41123628616333, "kl": 2.85546875, "learning_rate": 8.484848484848486e-06, "loss": 0.0028, "reward": 0.002700603276025504, "reward_std": 0.10815594345331192, "rewards/ndcg_rule_reward": -0.02269002189859748, "rewards/rule_reward": 0.025390625, "step": 85, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 5.26171875, "epoch": 0.052137011215519856, "grad_norm": 1.4707138538360596, "kl": 9.25, "learning_rate": 8.585858585858587e-06, "loss": 0.0092, "reward": 0.0038842095527797937, "reward_std": 0.12443472072482109, "rewards/ndcg_rule_reward": -0.02541266568005085, "rewards/rule_reward": 0.029296875, "step": 86, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.162109375, "epoch": 0.05274325553197939, "grad_norm": 1.3242831230163574, "kl": 5.34375, "learning_rate": 8.686868686868687e-06, "loss": 0.0053, "reward": 0.003143581096082926, "reward_std": 0.10796012729406357, "rewards/ndcg_rule_reward": -0.022247043438255787, "rewards/rule_reward": 0.025390625, "step": 87, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 4.9765625, "epoch": 0.05334949984843892, "grad_norm": 5.821344375610352, "kl": 45.625, "learning_rate": 8.787878787878788e-06, "loss": 0.0456, "reward": 0.002971199806779623, "reward_std": 0.10800957679748535, "rewards/ndcg_rule_reward": -0.02241942472755909, "rewards/rule_reward": 0.025390625, "step": 88, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 5.05859375, "epoch": 0.053955744164898456, "grad_norm": 2.608046770095825, "kl": 22.34375, "learning_rate": 8.888888888888888e-06, "loss": 0.0223, "reward": 0.004287739167921245, "reward_std": 0.12423853576183319, "rewards/ndcg_rule_reward": -0.025009135715663433, "rewards/rule_reward": 0.029296875, "step": 89, "token_diversity": 0.388 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.05456198848135799, "grad_norm": 1.8069734573364258, "kl": 13.484375, "learning_rate": 8.98989898989899e-06, "loss": 0.0135, "reward": 0.004062265157699585, "reward_std": 0.13276611641049385, "rewards/ndcg_rule_reward": -0.02718773577362299, "rewards/rule_reward": 0.03125, "step": 90, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.1484375, "epoch": 0.05516823279781752, "grad_norm": 1.100334644317627, "kl": 7.734375, "learning_rate": 9.090909090909091e-06, "loss": 0.0077, "reward": 0.0023228502832353115, "reward_std": 0.0662580169737339, "rewards/ndcg_rule_reward": -0.013302149716764688, "rewards/rule_reward": 0.015625, "step": 91, "token_diversity": 0.43865624999999997 }, { "categorical_diversity": 1.0, "completion_length": 5.048828125, "epoch": 0.055774477114277056, "grad_norm": 59.84614944458008, "kl": 243.5, "learning_rate": 9.191919191919193e-06, "loss": 0.2437, "reward": 0.0038595666410401464, "reward_std": 0.1413056179881096, "rewards/ndcg_rule_reward": -0.029343558475375175, "rewards/rule_reward": 0.033203125, "step": 92, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.05638072143073659, "grad_norm": 2.0432283878326416, "kl": 27.3125, "learning_rate": 9.292929292929294e-06, "loss": 0.0273, "reward": 0.002930198097601533, "reward_std": 0.09963022172451019, "rewards/ndcg_rule_reward": -0.02050730213522911, "rewards/rule_reward": 0.0234375, "step": 93, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.05698696574719612, "grad_norm": 4.23698616027832, "kl": 34.5625, "learning_rate": 9.393939393939396e-06, "loss": 0.0346, "reward": 0.0025646473513916135, "reward_std": 0.08298399299383163, "rewards/ndcg_rule_reward": -0.01696660276502371, "rewards/rule_reward": 0.01953125, "step": 94, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.057593210063655656, "grad_norm": 1.4521992206573486, "kl": 6.3046875, "learning_rate": 9.494949494949497e-06, "loss": 0.0063, "reward": 0.0035879533970728517, "reward_std": 0.12457870692014694, "rewards/ndcg_rule_reward": -0.025708922185003757, "rewards/rule_reward": 0.029296875, "step": 95, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.05819945438011519, "grad_norm": 1.968686580657959, "kl": 21.65625, "learning_rate": 9.595959595959597e-06, "loss": 0.0217, "reward": 0.004346583504229784, "reward_std": 0.14944422245025635, "rewards/ndcg_rule_reward": -0.03080966603010893, "rewards/rule_reward": 0.03515625, "step": 96, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.05880569869657472, "grad_norm": 0.7691456079483032, "kl": 3.6015625, "learning_rate": 9.696969696969698e-06, "loss": 0.0036, "reward": 0.0015653088921681046, "reward_std": 0.05819664150476456, "rewards/ndcg_rule_reward": -0.012106566689908504, "rewards/rule_reward": 0.013671875, "step": 97, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.03125, "epoch": 0.059411943013034256, "grad_norm": 1.8538217544555664, "kl": 10.8125, "learning_rate": 9.797979797979798e-06, "loss": 0.0108, "reward": 0.003947780700400472, "reward_std": 0.12440408393740654, "rewards/ndcg_rule_reward": -0.025349094532430172, "rewards/rule_reward": 0.029296875, "step": 98, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.046875, "epoch": 0.06001818732949379, "grad_norm": 1.6196844577789307, "kl": 4.484375, "learning_rate": 9.8989898989899e-06, "loss": 0.0045, "reward": 0.0035773604176938534, "reward_std": 0.12458128854632378, "rewards/ndcg_rule_reward": -0.02571951597929001, "rewards/rule_reward": 0.029296875, "step": 99, "token_diversity": 0.53515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.06062443164595332, "grad_norm": 1.11077082157135, "kl": 3.9580078125, "learning_rate": 1e-05, "loss": 0.004, "reward": 0.0026919743977487087, "reward_std": 0.11659354716539383, "rewards/ndcg_rule_reward": -0.024651776999235153, "rewards/rule_reward": 0.02734375, "step": 100, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 4.99609375, "epoch": 0.06123067596241285, "grad_norm": 1.4386545419692993, "kl": 13.21875, "learning_rate": 9.999997591934084e-06, "loss": 0.0132, "reward": 0.0031170203583315015, "reward_std": 0.09110834449529648, "rewards/ndcg_rule_reward": -0.01836735475808382, "rewards/rule_reward": 0.021484375, "step": 101, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.015625, "epoch": 0.06183692027887239, "grad_norm": 1.3184139728546143, "kl": 10.53125, "learning_rate": 9.999990367738653e-06, "loss": 0.0105, "reward": 0.0017437389469705522, "reward_std": 0.06653835251927376, "rewards/ndcg_rule_reward": -0.013881261460483074, "rewards/rule_reward": 0.015625, "step": 102, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.06244316459533192, "grad_norm": 1.385664939880371, "kl": 4.578125, "learning_rate": 9.999978327420663e-06, "loss": 0.0046, "reward": 0.0029380694031715393, "reward_std": 0.10805146768689156, "rewards/ndcg_rule_reward": -0.02245255559682846, "rewards/rule_reward": 0.025390625, "step": 103, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.06304940891179145, "grad_norm": 208.97705078125, "kl": 957.8125, "learning_rate": 9.999961470991716e-06, "loss": 0.9635, "reward": 0.0037782287690788507, "reward_std": 0.13292192667722702, "rewards/ndcg_rule_reward": -0.027471771463751793, "rewards/rule_reward": 0.03125, "step": 104, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.06365565322825098, "grad_norm": 31.85740852355957, "kl": 134.375, "learning_rate": 9.999939798468046e-06, "loss": 0.1344, "reward": 0.0036350685404613614, "reward_std": 0.12457232922315598, "rewards/ndcg_rule_reward": -0.02566180657595396, "rewards/rule_reward": 0.029296875, "step": 105, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 4.98828125, "epoch": 0.06426189754471053, "grad_norm": 4.438607215881348, "kl": 38.0, "learning_rate": 9.99991330987053e-06, "loss": 0.0381, "reward": 0.004312942270189524, "reward_std": 0.14945737272500992, "rewards/ndcg_rule_reward": -0.030843306332826614, "rewards/rule_reward": 0.03515625, "step": 106, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.154296875, "epoch": 0.06486814186117006, "grad_norm": 56.77813720703125, "kl": 107.75, "learning_rate": 9.999882005224682e-06, "loss": 0.1079, "reward": 0.004558513173833489, "reward_std": 0.14095086604356766, "rewards/ndcg_rule_reward": -0.028644612058997154, "rewards/rule_reward": 0.033203125, "step": 107, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.06547438617762959, "grad_norm": 1.3982219696044922, "kl": 6.59375, "learning_rate": 9.999845884560654e-06, "loss": 0.0066, "reward": 0.002154433459509164, "reward_std": 0.08317948877811432, "rewards/ndcg_rule_reward": -0.017376816365867853, "rewards/rule_reward": 0.01953125, "step": 108, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.06608063049408912, "grad_norm": 10.251948356628418, "kl": 71.25, "learning_rate": 9.999804947913241e-06, "loss": 0.0715, "reward": 0.0030952432425692677, "reward_std": 0.12482987344264984, "rewards/ndcg_rule_reward": -0.026201631873846054, "rewards/rule_reward": 0.029296875, "step": 109, "token_diversity": 0.49334374999999997 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.06668687481054865, "grad_norm": 1.5216277837753296, "kl": 11.0, "learning_rate": 9.999759195321872e-06, "loss": 0.011, "reward": 0.003257556352764368, "reward_std": 0.10789394751191139, "rewards/ndcg_rule_reward": -0.02213306911289692, "rewards/rule_reward": 0.025390625, "step": 110, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.06729311912700818, "grad_norm": 2.815002679824829, "kl": 27.625, "learning_rate": 9.999708626830617e-06, "loss": 0.0276, "reward": 0.003968127886764705, "reward_std": 0.13280623778700829, "rewards/ndcg_rule_reward": -0.027281872928142548, "rewards/rule_reward": 0.03125, "step": 111, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.068359375, "epoch": 0.06789936344346771, "grad_norm": 1.0675429105758667, "kl": 9.6875, "learning_rate": 9.999653242488187e-06, "loss": 0.0097, "reward": 0.002387779881246388, "reward_std": 0.08303026854991913, "rewards/ndcg_rule_reward": -0.017143470235168934, "rewards/rule_reward": 0.01953125, "step": 112, "token_diversity": 0.53515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.06850560775992726, "grad_norm": 1.6856718063354492, "kl": 24.25, "learning_rate": 9.99959304234793e-06, "loss": 0.0242, "reward": 0.0030398350208997726, "reward_std": 0.09956571459770203, "rewards/ndcg_rule_reward": -0.020397664979100227, "rewards/rule_reward": 0.0234375, "step": 113, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.06911185207638679, "grad_norm": 1.4351823329925537, "kl": 12.40625, "learning_rate": 9.99952802646783e-06, "loss": 0.0124, "reward": 0.002681054756976664, "reward_std": 0.11656617373228073, "rewards/ndcg_rule_reward": -0.02466269489377737, "rewards/rule_reward": 0.02734375, "step": 114, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.06971809639284632, "grad_norm": 2.2399749755859375, "kl": 21.8125, "learning_rate": 9.999458194910512e-06, "loss": 0.0218, "reward": 0.0032584378495812416, "reward_std": 0.12472138553857803, "rewards/ndcg_rule_reward": -0.02603843715041876, "rewards/rule_reward": 0.029296875, "step": 115, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07032434070930585, "grad_norm": 1.2986693382263184, "kl": 7.93359375, "learning_rate": 9.999383547743242e-06, "loss": 0.0079, "reward": 0.002675711177289486, "reward_std": 0.09131237491965294, "rewards/ndcg_rule_reward": -0.018808663822710514, "rewards/rule_reward": 0.021484375, "step": 116, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07093058502576538, "grad_norm": 2.5954174995422363, "kl": 28.89453125, "learning_rate": 9.99930408503792e-06, "loss": 0.0289, "reward": 0.002379111188929528, "reward_std": 0.09147680550813675, "rewards/ndcg_rule_reward": -0.019105263520032167, "rewards/rule_reward": 0.021484375, "step": 117, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07153682934222491, "grad_norm": 3.6854915618896484, "kl": 22.9375, "learning_rate": 9.999219806871086e-06, "loss": 0.0229, "reward": 0.0030531653901562095, "reward_std": 0.0995555967092514, "rewards/ndcg_rule_reward": -0.020384333562105894, "rewards/rule_reward": 0.0234375, "step": 118, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07214307365868446, "grad_norm": 1.9084738492965698, "kl": 19.78125, "learning_rate": 9.999130713323922e-06, "loss": 0.0198, "reward": 0.0039125209441408515, "reward_std": 0.12441373616456985, "rewards/ndcg_rule_reward": -0.025384354405105114, "rewards/rule_reward": 0.029296875, "step": 119, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07274931797514399, "grad_norm": 3.537377119064331, "kl": 25.84375, "learning_rate": 9.99903680448224e-06, "loss": 0.0258, "reward": 0.0028625268023461103, "reward_std": 0.10804607346653938, "rewards/ndcg_rule_reward": -0.02252809703350067, "rewards/rule_reward": 0.025390625, "step": 120, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.09375, "epoch": 0.07335556229160352, "grad_norm": 1.1645773649215698, "kl": 4.03125, "learning_rate": 9.998938080436503e-06, "loss": 0.004, "reward": 0.002319251303561032, "reward_std": 0.08305169641971588, "rewards/ndcg_rule_reward": -0.017211999744176865, "rewards/rule_reward": 0.01953125, "step": 121, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.150390625, "epoch": 0.07396180660806305, "grad_norm": 1.506544589996338, "kl": 12.0, "learning_rate": 9.998834541281798e-06, "loss": 0.012, "reward": 0.004175003035925329, "reward_std": 0.1326926164329052, "rewards/ndcg_rule_reward": -0.027074997313320637, "rewards/rule_reward": 0.03125, "step": 122, "token_diversity": 0.36314786585365855 }, { "categorical_diversity": 1.0, "completion_length": 5.0078125, "epoch": 0.07456805092452258, "grad_norm": 1.616723656654358, "kl": 16.203125, "learning_rate": 9.998726187117863e-06, "loss": 0.0162, "reward": 0.0028556882170960307, "reward_std": 0.09968499466776848, "rewards/ndcg_rule_reward": -0.020581811666488647, "rewards/rule_reward": 0.0234375, "step": 123, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07517429524098211, "grad_norm": 1.467658281326294, "kl": 7.42578125, "learning_rate": 9.99861301804906e-06, "loss": 0.0074, "reward": 0.0036726004909723997, "reward_std": 0.13298653811216354, "rewards/ndcg_rule_reward": -0.027577398344874382, "rewards/rule_reward": 0.03125, "step": 124, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07578053955744164, "grad_norm": 1.7592790126800537, "kl": 19.625, "learning_rate": 9.9984950341844e-06, "loss": 0.0196, "reward": 0.00242888240609318, "reward_std": 0.0914539285004139, "rewards/ndcg_rule_reward": -0.01905549317598343, "rewards/rule_reward": 0.021484375, "step": 125, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07638678387390119, "grad_norm": 1.5834071636199951, "kl": 14.15625, "learning_rate": 9.99837223563753e-06, "loss": 0.0142, "reward": 0.0020160318817943335, "reward_std": 0.07484462670981884, "rewards/ndcg_rule_reward": -0.015562093816697598, "rewards/rule_reward": 0.017578125, "step": 126, "token_diversity": 0.3828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07699302819036072, "grad_norm": 9.99822998046875, "kl": 50.46875, "learning_rate": 9.99824462252673e-06, "loss": 0.0505, "reward": 0.0026846008840948343, "reward_std": 0.08293991163372993, "rewards/ndcg_rule_reward": -0.016846650280058384, "rewards/rule_reward": 0.01953125, "step": 127, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07759927250682025, "grad_norm": 8.654129028320312, "kl": 30.25, "learning_rate": 9.998112194974922e-06, "loss": 0.0303, "reward": 0.0027267339173704386, "reward_std": 0.08289271593093872, "rewards/ndcg_rule_reward": -0.01680451724678278, "rewards/rule_reward": 0.01953125, "step": 128, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07820551682327978, "grad_norm": 4.277270793914795, "kl": 58.25, "learning_rate": 9.997974953109664e-06, "loss": 0.058, "reward": 0.003042236319743097, "reward_std": 0.09956573694944382, "rewards/ndcg_rule_reward": -0.02039526216685772, "rewards/rule_reward": 0.0234375, "step": 129, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07881176113973931, "grad_norm": 2.242607831954956, "kl": 34.875, "learning_rate": 9.997832897063148e-06, "loss": 0.0348, "reward": 0.003290464635938406, "reward_std": 0.0994572564959526, "rewards/ndcg_rule_reward": -0.02014703582972288, "rewards/rule_reward": 0.0234375, "step": 130, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.07941800545619884, "grad_norm": 1.3606550693511963, "kl": 16.71875, "learning_rate": 9.99768602697221e-06, "loss": 0.0167, "reward": 0.003358633955940604, "reward_std": 0.09100349247455597, "rewards/ndcg_rule_reward": -0.018125740811228752, "rewards/rule_reward": 0.021484375, "step": 131, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.08002424977265839, "grad_norm": 1.2290043830871582, "kl": 10.65625, "learning_rate": 9.997534342978316e-06, "loss": 0.0106, "reward": 0.002417540003079921, "reward_std": 0.07459459826350212, "rewards/ndcg_rule_reward": -0.015160584822297096, "rewards/rule_reward": 0.017578125, "step": 132, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.08063049408911792, "grad_norm": 5.9420599937438965, "kl": 25.7578125, "learning_rate": 9.997377845227577e-06, "loss": 0.0258, "reward": 0.0029226152691990137, "reward_std": 0.09119580686092377, "rewards/ndcg_rule_reward": -0.01856175996363163, "rewards/rule_reward": 0.021484375, "step": 133, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.08123673840557745, "grad_norm": 1.9829161167144775, "kl": 14.625, "learning_rate": 9.997216533870728e-06, "loss": 0.0146, "reward": 0.004498742520809174, "reward_std": 0.14092668145895004, "rewards/ndcg_rule_reward": -0.028704382479190826, "rewards/rule_reward": 0.033203125, "step": 134, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.08184298272203698, "grad_norm": 1.2289100885391235, "kl": 5.96875, "learning_rate": 9.997050409063153e-06, "loss": 0.006, "reward": 0.002496803062967956, "reward_std": 0.09982331469655037, "rewards/ndcg_rule_reward": -0.020940696820616722, "rewards/rule_reward": 0.0234375, "step": 135, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0078125, "epoch": 0.08244922703849651, "grad_norm": 5.935653209686279, "kl": 37.640625, "learning_rate": 9.996879470964869e-06, "loss": 0.0377, "reward": 0.004462102428078651, "reward_std": 0.1325777992606163, "rewards/ndcg_rule_reward": -0.02678789757192135, "rewards/rule_reward": 0.03125, "step": 136, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.08305547135495604, "grad_norm": 2.373534679412842, "kl": 19.26171875, "learning_rate": 9.996703719740526e-06, "loss": 0.0193, "reward": 0.0037629324942827225, "reward_std": 0.13288761675357819, "rewards/ndcg_rule_reward": -0.027487067505717278, "rewards/rule_reward": 0.03125, "step": 137, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.09375, "epoch": 0.08366171567141559, "grad_norm": 16.80321502685547, "kl": 71.25, "learning_rate": 9.996523155559413e-06, "loss": 0.0712, "reward": 0.0037768763722851872, "reward_std": 0.1160569041967392, "rewards/ndcg_rule_reward": -0.023566873744130135, "rewards/rule_reward": 0.02734375, "step": 138, "token_diversity": 0.38955965909090906 }, { "categorical_diversity": 1.0, "completion_length": 5.169921875, "epoch": 0.08426795998787512, "grad_norm": 11.46741771697998, "kl": 86.0, "learning_rate": 9.996337778595452e-06, "loss": 0.0862, "reward": 0.004862775094807148, "reward_std": 0.14078430086374283, "rewards/ndcg_rule_reward": -0.028340352699160576, "rewards/rule_reward": 0.033203125, "step": 139, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.08487420430433465, "grad_norm": 2.2883706092834473, "kl": 29.5625, "learning_rate": 9.996147589027207e-06, "loss": 0.0296, "reward": 0.004882521694526076, "reward_std": 0.16606505215168, "rewards/ndcg_rule_reward": -0.03417997807264328, "rewards/rule_reward": 0.0390625, "step": 140, "token_diversity": 0.359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.08548044862079418, "grad_norm": 2.0223405361175537, "kl": 44.6875, "learning_rate": 9.995952587037872e-06, "loss": 0.0447, "reward": 0.0039000253891572356, "reward_std": 0.11602526530623436, "rewards/ndcg_rule_reward": -0.02344372496008873, "rewards/rule_reward": 0.02734375, "step": 141, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.08608669293725371, "grad_norm": 2.1055638790130615, "kl": 38.25, "learning_rate": 9.995752772815275e-06, "loss": 0.0382, "reward": 0.004205905017443001, "reward_std": 0.12426484376192093, "rewards/ndcg_rule_reward": -0.02509097009897232, "rewards/rule_reward": 0.029296875, "step": 142, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.041015625, "epoch": 0.08669293725371324, "grad_norm": 2.69620418548584, "kl": 40.8125, "learning_rate": 9.995548146551886e-06, "loss": 0.0409, "reward": 0.004805097822099924, "reward_std": 0.14923329651355743, "rewards/ndcg_rule_reward": -0.030351154506206512, "rewards/rule_reward": 0.03515625, "step": 143, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.583984375, "epoch": 0.08729918157017277, "grad_norm": 1.456287145614624, "kl": 9.21875, "learning_rate": 9.995338708444804e-06, "loss": 0.0092, "reward": 0.0024425541050732136, "reward_std": 0.09984105825424194, "rewards/ndcg_rule_reward": -0.0209949454292655, "rewards/rule_reward": 0.0234375, "step": 144, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.24609375, "epoch": 0.08790542588663232, "grad_norm": 1.7118057012557983, "kl": 24.25, "learning_rate": 9.995124458695769e-06, "loss": 0.0242, "reward": 0.002522778057027608, "reward_std": 0.07456038892269135, "rewards/ndcg_rule_reward": -0.015055347234010696, "rewards/rule_reward": 0.017578125, "step": 145, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.259765625, "epoch": 0.08851167020309185, "grad_norm": 0.949589192867279, "kl": 3.984375, "learning_rate": 9.994905397511148e-06, "loss": 0.004, "reward": 0.002375552197918296, "reward_std": 0.09147072583436966, "rewards/ndcg_rule_reward": -0.01910882256925106, "rewards/rule_reward": 0.021484375, "step": 146, "token_diversity": 0.3300856082375479 }, { "categorical_diversity": 1.0, "completion_length": 5.193359375, "epoch": 0.08911791451955138, "grad_norm": 4.305733680725098, "kl": 17.75, "learning_rate": 9.994681525101947e-06, "loss": 0.0177, "reward": 0.003580257878638804, "reward_std": 0.17506562173366547, "rewards/ndcg_rule_reward": -0.037435369566082954, "rewards/rule_reward": 0.041015625, "step": 147, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.078125, "epoch": 0.08972415883601091, "grad_norm": 1.2409204244613647, "kl": 6.2890625, "learning_rate": 9.994452841683808e-06, "loss": 0.0063, "reward": 0.002378991339355707, "reward_std": 0.0914706140756607, "rewards/ndcg_rule_reward": -0.019105383194983006, "rewards/rule_reward": 0.021484375, "step": 148, "token_diversity": 0.53125 }, { "categorical_diversity": 1.0, "completion_length": 5.12890625, "epoch": 0.09033040315247044, "grad_norm": 2.5432255268096924, "kl": 13.625, "learning_rate": 9.994219347477003e-06, "loss": 0.0136, "reward": 0.0037492028204724193, "reward_std": 0.11607073247432709, "rewards/ndcg_rule_reward": -0.02359454706311226, "rewards/rule_reward": 0.02734375, "step": 149, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.142578125, "epoch": 0.09093664746892997, "grad_norm": 34.823909759521484, "kl": 108.125, "learning_rate": 9.993981042706442e-06, "loss": 0.1078, "reward": 0.0019192195613868535, "reward_std": 0.05802551656961441, "rewards/ndcg_rule_reward": -0.011752655729651451, "rewards/rule_reward": 0.013671875, "step": 150, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.30078125, "epoch": 0.09154289178538952, "grad_norm": 1.6136000156402588, "kl": 9.1875, "learning_rate": 9.993737927601663e-06, "loss": 0.0092, "reward": 0.0039145376067608595, "reward_std": 0.15808237344026566, "rewards/ndcg_rule_reward": -0.03319483622908592, "rewards/rule_reward": 0.037109375, "step": 151, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.05859375, "epoch": 0.09214913610184905, "grad_norm": 14.56311321258545, "kl": 59.5703125, "learning_rate": 9.993490002396843e-06, "loss": 0.0595, "reward": 0.0035244515165686607, "reward_std": 0.11617103964090347, "rewards/ndcg_rule_reward": -0.02381929848343134, "rewards/rule_reward": 0.02734375, "step": 152, "token_diversity": 0.38498813291139244 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.09275538041830858, "grad_norm": 3.0569427013397217, "kl": 31.125, "learning_rate": 9.993237267330792e-06, "loss": 0.031, "reward": 0.004398143035359681, "reward_std": 0.13259918987751007, "rewards/ndcg_rule_reward": -0.02685185708105564, "rewards/rule_reward": 0.03125, "step": 153, "token_diversity": 0.43856249999999997 }, { "categorical_diversity": 1.0, "completion_length": 5.068359375, "epoch": 0.09336162473476811, "grad_norm": 1.0174989700317383, "kl": 8.21875, "learning_rate": 9.992979722646948e-06, "loss": 0.0082, "reward": 0.0017878186190500855, "reward_std": 0.06649030931293964, "rewards/ndcg_rule_reward": -0.013837182428687811, "rewards/rule_reward": 0.015625, "step": 154, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.017578125, "epoch": 0.09396786905122764, "grad_norm": 1.9761266708374023, "kl": 31.5625, "learning_rate": 9.992717368593385e-06, "loss": 0.0315, "reward": 0.004072306212037802, "reward_std": 0.13273393362760544, "rewards/ndcg_rule_reward": -0.02717769332230091, "rewards/rule_reward": 0.03125, "step": 155, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.09457411336768717, "grad_norm": 0.8171387314796448, "kl": 4.9453125, "learning_rate": 9.992450205422813e-06, "loss": 0.0049, "reward": 0.0008284933865070343, "reward_std": 0.03326539043337107, "rewards/ndcg_rule_reward": -0.006984006613492966, "rewards/rule_reward": 0.0078125, "step": 156, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.146484375, "epoch": 0.0951803576841467, "grad_norm": 4.237576007843018, "kl": 49.375, "learning_rate": 9.992178233392565e-06, "loss": 0.0493, "reward": 0.0036992899840697646, "reward_std": 0.14137650281190872, "rewards/ndcg_rule_reward": -0.029503834433853626, "rewards/rule_reward": 0.033203125, "step": 157, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.09578660200060625, "grad_norm": 1.4716662168502808, "kl": 18.4375, "learning_rate": 9.991901452764614e-06, "loss": 0.0185, "reward": 0.0031080583576112986, "reward_std": 0.09954005479812622, "rewards/ndcg_rule_reward": -0.020329441875219345, "rewards/rule_reward": 0.0234375, "step": 158, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.26953125, "epoch": 0.09639284631706578, "grad_norm": 5.487523555755615, "kl": 34.40625, "learning_rate": 9.991619863805565e-06, "loss": 0.0344, "reward": 0.0032443798845633864, "reward_std": 0.0994680430740118, "rewards/ndcg_rule_reward": -0.02019311930052936, "rewards/rule_reward": 0.0234375, "step": 159, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 6.451171875, "epoch": 0.09699909063352531, "grad_norm": 2.0549159049987793, "kl": 43.625, "learning_rate": 9.991333466786648e-06, "loss": 0.0437, "reward": 0.003294955473393202, "reward_std": 0.10787570476531982, "rewards/ndcg_rule_reward": -0.02209566906094551, "rewards/rule_reward": 0.025390625, "step": 160, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.09760533494998484, "grad_norm": 2.9707603454589844, "kl": 26.5, "learning_rate": 9.99104226198373e-06, "loss": 0.0265, "reward": 0.0027614463469944894, "reward_std": 0.09128262102603912, "rewards/ndcg_rule_reward": -0.018722929060459137, "rewards/rule_reward": 0.021484375, "step": 161, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.09821157926644437, "grad_norm": 3.0609514713287354, "kl": 26.109375, "learning_rate": 9.990746249677308e-06, "loss": 0.0261, "reward": 0.003689868957735598, "reward_std": 0.11608732491731644, "rewards/ndcg_rule_reward": -0.02365388162434101, "rewards/rule_reward": 0.02734375, "step": 162, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.0988178235829039, "grad_norm": 1.4569913148880005, "kl": 13.75, "learning_rate": 9.990445430152507e-06, "loss": 0.0138, "reward": 0.0030165896750986576, "reward_std": 0.08274522796273232, "rewards/ndcg_rule_reward": -0.01651466079056263, "rewards/rule_reward": 0.01953125, "step": 163, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.283203125, "epoch": 0.09942406789936345, "grad_norm": 6.210915565490723, "kl": 11.625, "learning_rate": 9.990139803699085e-06, "loss": 0.0116, "reward": 0.0033546199556440115, "reward_std": 0.12465114891529083, "rewards/ndcg_rule_reward": -0.025942254811525345, "rewards/rule_reward": 0.029296875, "step": 164, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.10003031221582298, "grad_norm": 1.7273335456848145, "kl": 16.4375, "learning_rate": 9.98982937061143e-06, "loss": 0.0164, "reward": 0.004488120088353753, "reward_std": 0.14097020775079727, "rewards/ndcg_rule_reward": -0.028715004213154316, "rewards/rule_reward": 0.033203125, "step": 165, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.13671875, "epoch": 0.10063655653228251, "grad_norm": 1.4359859228134155, "kl": 9.6875, "learning_rate": 9.98951413118856e-06, "loss": 0.0097, "reward": 0.003243251354433596, "reward_std": 0.11632727831602097, "rewards/ndcg_rule_reward": -0.024100499227643013, "rewards/rule_reward": 0.02734375, "step": 166, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.10124280084874204, "grad_norm": 4.5463762283325195, "kl": 23.28125, "learning_rate": 9.98919408573412e-06, "loss": 0.0233, "reward": 0.002473915053997189, "reward_std": 0.09143567830324173, "rewards/ndcg_rule_reward": -0.019010460004210472, "rewards/rule_reward": 0.021484375, "step": 167, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.10184904516520157, "grad_norm": 1.3885849714279175, "kl": 15.0, "learning_rate": 9.988869234556386e-06, "loss": 0.015, "reward": 0.003780083265155554, "reward_std": 0.11606220528483391, "rewards/ndcg_rule_reward": -0.023563667200505733, "rewards/rule_reward": 0.02734375, "step": 168, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1024552894816611, "grad_norm": 1.953348994255066, "kl": 15.609375, "learning_rate": 9.988539577968265e-06, "loss": 0.0156, "reward": 0.0015670931315980852, "reward_std": 0.07498951256275177, "rewards/ndcg_rule_reward": -0.016011031344532967, "rewards/rule_reward": 0.017578125, "step": 169, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.10306153379812064, "grad_norm": 1.0479294061660767, "kl": 10.8125, "learning_rate": 9.98820511628729e-06, "loss": 0.0108, "reward": 0.0027167208027094603, "reward_std": 0.09129361435770988, "rewards/ndcg_rule_reward": -0.018767654430121183, "rewards/rule_reward": 0.021484375, "step": 170, "token_diversity": 0.32421875 }, { "categorical_diversity": 1.0, "completion_length": 5.166015625, "epoch": 0.10366777811458018, "grad_norm": 1.322417140007019, "kl": 22.8125, "learning_rate": 9.987865849835626e-06, "loss": 0.0228, "reward": 0.004146653925999999, "reward_std": 0.12427961453795433, "rewards/ndcg_rule_reward": -0.025150220841169357, "rewards/rule_reward": 0.029296875, "step": 171, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.10427402243103971, "grad_norm": 1.2571022510528564, "kl": 18.03125, "learning_rate": 9.98752177894006e-06, "loss": 0.018, "reward": 0.002986523322761059, "reward_std": 0.108014065772295, "rewards/ndcg_rule_reward": -0.022404102608561516, "rewards/rule_reward": 0.025390625, "step": 172, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.10488026674749924, "grad_norm": 1.3610097169876099, "kl": 16.890625, "learning_rate": 9.98717290393201e-06, "loss": 0.0169, "reward": 0.003820057725533843, "reward_std": 0.11604608222842216, "rewards/ndcg_rule_reward": -0.023523693438619375, "rewards/rule_reward": 0.02734375, "step": 173, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.10548651106395877, "grad_norm": 1.5536508560180664, "kl": 22.5625, "learning_rate": 9.986819225147521e-06, "loss": 0.0226, "reward": 0.004173900466412306, "reward_std": 0.13273730129003525, "rewards/ndcg_rule_reward": -0.027076099067926407, "rewards/rule_reward": 0.03125, "step": 174, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1060927553804183, "grad_norm": 1.525434136390686, "kl": 20.5, "learning_rate": 9.98646074292727e-06, "loss": 0.0205, "reward": 0.003756241174414754, "reward_std": 0.1328967735171318, "rewards/ndcg_rule_reward": -0.02749375905841589, "rewards/rule_reward": 0.03125, "step": 175, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.10669899969687784, "grad_norm": 1.5447075366973877, "kl": 19.0, "learning_rate": 9.986097457616554e-06, "loss": 0.019, "reward": 0.00348559592384845, "reward_std": 0.11622222512960434, "rewards/ndcg_rule_reward": -0.023858153261244297, "rewards/rule_reward": 0.02734375, "step": 176, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.04296875, "epoch": 0.10730524401333738, "grad_norm": 2.089912176132202, "kl": 25.875, "learning_rate": 9.985729369565299e-06, "loss": 0.0258, "reward": 0.003420403809286654, "reward_std": 0.11624148115515709, "rewards/ndcg_rule_reward": -0.023923346772789955, "rewards/rule_reward": 0.02734375, "step": 177, "token_diversity": 0.375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.10791148832979691, "grad_norm": 2.201260805130005, "kl": 30.703125, "learning_rate": 9.985356479128056e-06, "loss": 0.0308, "reward": 0.004272771766409278, "reward_std": 0.13266821950674057, "rewards/ndcg_rule_reward": -0.026977227069437504, "rewards/rule_reward": 0.03125, "step": 178, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.10851773264625644, "grad_norm": 1.6947476863861084, "kl": 28.75, "learning_rate": 9.984978786664004e-06, "loss": 0.0287, "reward": 0.003409674856811762, "reward_std": 0.12467524409294128, "rewards/ndcg_rule_reward": -0.025887200608849525, "rewards/rule_reward": 0.029296875, "step": 179, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.02734375, "epoch": 0.10912397696271597, "grad_norm": 1.8790035247802734, "kl": 29.125, "learning_rate": 9.984596292536948e-06, "loss": 0.0291, "reward": 0.004663507686927915, "reward_std": 0.1493155136704445, "rewards/ndcg_rule_reward": -0.030492743477225304, "rewards/rule_reward": 0.03515625, "step": 180, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0234375, "epoch": 0.1097302212791755, "grad_norm": 1.5913363695144653, "kl": 24.375, "learning_rate": 9.984208997115314e-06, "loss": 0.0244, "reward": 0.004337949911132455, "reward_std": 0.13262248039245605, "rewards/ndcg_rule_reward": -0.026912051253020763, "rewards/rule_reward": 0.03125, "step": 181, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11033646559563504, "grad_norm": 1.3276008367538452, "kl": 20.875, "learning_rate": 9.983816900772156e-06, "loss": 0.0208, "reward": 0.003142483765259385, "reward_std": 0.09953096508979797, "rewards/ndcg_rule_reward": -0.02029501646757126, "rewards/rule_reward": 0.0234375, "step": 182, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11094270991209458, "grad_norm": 2.4598286151885986, "kl": 22.59375, "learning_rate": 9.983420003885148e-06, "loss": 0.0226, "reward": 0.003984748851507902, "reward_std": 0.14961977303028107, "rewards/ndcg_rule_reward": -0.03117150068283081, "rewards/rule_reward": 0.03515625, "step": 183, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11154895422855411, "grad_norm": 2.036473512649536, "kl": 26.9375, "learning_rate": 9.983018306836601e-06, "loss": 0.027, "reward": 0.0029725772328674793, "reward_std": 0.11644170060753822, "rewards/ndcg_rule_reward": -0.024371173232793808, "rewards/rule_reward": 0.02734375, "step": 184, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11215519854501364, "grad_norm": 1.4313404560089111, "kl": 36.3125, "learning_rate": 9.98261181001343e-06, "loss": 0.0363, "reward": 0.004024435766041279, "reward_std": 0.14960427209734917, "rewards/ndcg_rule_reward": -0.031131813302636147, "rewards/rule_reward": 0.03515625, "step": 185, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11276144286147317, "grad_norm": 8.69056224822998, "kl": 42.5, "learning_rate": 9.98220051380719e-06, "loss": 0.0425, "reward": 0.0048340801149606705, "reward_std": 0.18288075923919678, "rewards/ndcg_rule_reward": -0.03813466988503933, "rewards/rule_reward": 0.04296875, "step": 186, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1133676871779327, "grad_norm": 8.139336585998535, "kl": 53.125, "learning_rate": 9.981784418614048e-06, "loss": 0.0531, "reward": 0.0031133830780163407, "reward_std": 0.10797418281435966, "rewards/ndcg_rule_reward": -0.02227724250406027, "rewards/rule_reward": 0.025390625, "step": 187, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11397393149439224, "grad_norm": 2.7505133152008057, "kl": 34.625, "learning_rate": 9.9813635248348e-06, "loss": 0.0345, "reward": 0.0025488974060863256, "reward_std": 0.09980884194374084, "rewards/ndcg_rule_reward": -0.020888603292405605, "rewards/rule_reward": 0.0234375, "step": 188, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.67578125, "epoch": 0.11458017581085177, "grad_norm": 1.279457449913025, "kl": 14.34375, "learning_rate": 9.980937832874864e-06, "loss": 0.0143, "reward": 0.002641516039147973, "reward_std": 0.09132087603211403, "rewards/ndcg_rule_reward": -0.018842860125005245, "rewards/rule_reward": 0.021484375, "step": 189, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11518642012731131, "grad_norm": 3.49721097946167, "kl": 19.71875, "learning_rate": 9.980507343144273e-06, "loss": 0.0198, "reward": 0.002330141025595367, "reward_std": 0.09148626029491425, "rewards/ndcg_rule_reward": -0.019154234789311886, "rewards/rule_reward": 0.021484375, "step": 190, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11579266444377084, "grad_norm": 1.7191952466964722, "kl": 28.3125, "learning_rate": 9.98007205605769e-06, "loss": 0.0283, "reward": 0.002685699029825628, "reward_std": 0.09133867174386978, "rewards/ndcg_rule_reward": -0.018798675388097763, "rewards/rule_reward": 0.021484375, "step": 191, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11639890876023037, "grad_norm": 1.5413095951080322, "kl": 34.0, "learning_rate": 9.979631972034393e-06, "loss": 0.034, "reward": 0.003643923904746771, "reward_std": 0.1245337724685669, "rewards/ndcg_rule_reward": -0.025652951560914516, "rewards/rule_reward": 0.029296875, "step": 192, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1170051530766899, "grad_norm": 1.3878930807113647, "kl": 12.78125, "learning_rate": 9.979187091498283e-06, "loss": 0.0128, "reward": 0.002899164450354874, "reward_std": 0.10806329175829887, "rewards/ndcg_rule_reward": -0.022491460666060448, "rewards/rule_reward": 0.025390625, "step": 193, "token_diversity": 0.36328125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11761139739314944, "grad_norm": 1.4548381567001343, "kl": 15.4375, "learning_rate": 9.978737414877882e-06, "loss": 0.0154, "reward": 0.0031390669755637646, "reward_std": 0.0995258055627346, "rewards/ndcg_rule_reward": -0.020298432558774948, "rewards/rule_reward": 0.0234375, "step": 194, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11821764170960897, "grad_norm": 15.222572326660156, "kl": 42.75, "learning_rate": 9.978282942606329e-06, "loss": 0.0427, "reward": 0.002832579892128706, "reward_std": 0.12494215369224548, "rewards/ndcg_rule_reward": -0.026464294642210007, "rewards/rule_reward": 0.029296875, "step": 195, "token_diversity": 0.35546875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11882388602606851, "grad_norm": 5.825077056884766, "kl": 42.625, "learning_rate": 9.977823675121382e-06, "loss": 0.0426, "reward": 0.003770269569940865, "reward_std": 0.11607187986373901, "rewards/ndcg_rule_reward": -0.02357348147779703, "rewards/rule_reward": 0.02734375, "step": 196, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.11943013034252804, "grad_norm": 1.5069775581359863, "kl": 26.3125, "learning_rate": 9.977359612865424e-06, "loss": 0.0263, "reward": 0.0036322467494755983, "reward_std": 0.12453678250312805, "rewards/ndcg_rule_reward": -0.02566462755203247, "rewards/rule_reward": 0.029296875, "step": 197, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.12003637465898757, "grad_norm": 1.5078891515731812, "kl": 28.375, "learning_rate": 9.97689075628545e-06, "loss": 0.0284, "reward": 0.004196232301183045, "reward_std": 0.13270164653658867, "rewards/ndcg_rule_reward": -0.02705376874655485, "rewards/rule_reward": 0.03125, "step": 198, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1206426189754471, "grad_norm": 1.831913709640503, "kl": 27.84375, "learning_rate": 9.97641710583307e-06, "loss": 0.0278, "reward": 0.0030996419955044985, "reward_std": 0.13315801694989204, "rewards/ndcg_rule_reward": -0.028150358237326145, "rewards/rule_reward": 0.03125, "step": 199, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.12124886329190664, "grad_norm": 1.7019771337509155, "kl": 11.9765625, "learning_rate": 9.975938661964523e-06, "loss": 0.012, "reward": 0.0022032004198990762, "reward_std": 0.09155081957578659, "rewards/ndcg_rule_reward": -0.019281175453215837, "rewards/rule_reward": 0.021484375, "step": 200, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.12185510760836617, "grad_norm": 1.1559094190597534, "kl": 23.9375, "learning_rate": 9.97545542514066e-06, "loss": 0.0239, "reward": 0.003443082794547081, "reward_std": 0.09937525913119316, "rewards/ndcg_rule_reward": -0.01999441720545292, "rewards/rule_reward": 0.0234375, "step": 201, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1224613519248257, "grad_norm": 3.206912040710449, "kl": 35.375, "learning_rate": 9.974967395826941e-06, "loss": 0.0353, "reward": 0.0039285566890612245, "reward_std": 0.13283997401595116, "rewards/ndcg_rule_reward": -0.027321443893015385, "rewards/rule_reward": 0.03125, "step": 202, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.12306759624128524, "grad_norm": 1.8363721370697021, "kl": 24.90625, "learning_rate": 9.974474574493452e-06, "loss": 0.0249, "reward": 0.0032121778931468725, "reward_std": 0.11630330979824066, "rewards/ndcg_rule_reward": -0.024131571874022484, "rewards/rule_reward": 0.02734375, "step": 203, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.12367384055774477, "grad_norm": 2.3955841064453125, "kl": 36.0625, "learning_rate": 9.973976961614893e-06, "loss": 0.036, "reward": 0.004168208921328187, "reward_std": 0.1579272821545601, "rewards/ndcg_rule_reward": -0.03294116631150246, "rewards/rule_reward": 0.037109375, "step": 204, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1242800848742043, "grad_norm": 2.110304117202759, "kl": 38.5, "learning_rate": 9.973474557670574e-06, "loss": 0.0385, "reward": 0.004486023681238294, "reward_std": 0.1494011953473091, "rewards/ndcg_rule_reward": -0.03067022655159235, "rewards/rule_reward": 0.03515625, "step": 205, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.12488632919066384, "grad_norm": 1.069045901298523, "kl": 13.31640625, "learning_rate": 9.972967363144428e-06, "loss": 0.0133, "reward": 0.002840168308466673, "reward_std": 0.09126375243067741, "rewards/ndcg_rule_reward": -0.018644206691533327, "rewards/rule_reward": 0.021484375, "step": 206, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.12549257350712337, "grad_norm": 1.4294668436050415, "kl": 19.4375, "learning_rate": 9.972455378524997e-06, "loss": 0.0194, "reward": 0.002639963699039072, "reward_std": 0.09978482127189636, "rewards/ndcg_rule_reward": -0.020797536708414555, "rewards/rule_reward": 0.0234375, "step": 207, "token_diversity": 0.4609085648148148 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1260988178235829, "grad_norm": 1.9169923067092896, "kl": 42.625, "learning_rate": 9.971938604305435e-06, "loss": 0.0428, "reward": 0.0038089079316705465, "reward_std": 0.13287436962127686, "rewards/ndcg_rule_reward": -0.02744109369814396, "rewards/rule_reward": 0.03125, "step": 208, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.12670506214004243, "grad_norm": 1.534973382949829, "kl": 29.3125, "learning_rate": 9.971417040983515e-06, "loss": 0.0293, "reward": 0.00376955175306648, "reward_std": 0.12445768341422081, "rewards/ndcg_rule_reward": -0.02552732266485691, "rewards/rule_reward": 0.029296875, "step": 209, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.12731130645650196, "grad_norm": 2.476195812225342, "kl": 40.25, "learning_rate": 9.970890689061622e-06, "loss": 0.0403, "reward": 0.0031803178717382252, "reward_std": 0.09951521456241608, "rewards/ndcg_rule_reward": -0.02025718241930008, "rewards/rule_reward": 0.0234375, "step": 210, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1279175507729615, "grad_norm": 2.4606173038482666, "kl": 24.34375, "learning_rate": 9.97035954904675e-06, "loss": 0.0244, "reward": 0.0032970933243632317, "reward_std": 0.10787918418645859, "rewards/ndcg_rule_reward": -0.022093532606959343, "rewards/rule_reward": 0.025390625, "step": 211, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.12852379508942105, "grad_norm": 1.4935859441757202, "kl": 23.875, "learning_rate": 9.969823621450508e-06, "loss": 0.0239, "reward": 0.002700387383811176, "reward_std": 0.099726852029562, "rewards/ndcg_rule_reward": -0.020737112499773502, "rewards/rule_reward": 0.0234375, "step": 212, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.12913003940588058, "grad_norm": 7.240038871765137, "kl": 44.0, "learning_rate": 9.969282906789113e-06, "loss": 0.0441, "reward": 0.003463640983682126, "reward_std": 0.13302382454276085, "rewards/ndcg_rule_reward": -0.027786359190940857, "rewards/rule_reward": 0.03125, "step": 213, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1297362837223401, "grad_norm": 4.055492401123047, "kl": 24.3125, "learning_rate": 9.968737405583398e-06, "loss": 0.0243, "reward": 0.0029682761523872614, "reward_std": 0.09958542883396149, "rewards/ndcg_rule_reward": -0.020469224080443382, "rewards/rule_reward": 0.0234375, "step": 214, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13034252803879964, "grad_norm": 1.6444170475006104, "kl": 31.875, "learning_rate": 9.968187118358805e-06, "loss": 0.0318, "reward": 0.0045228018425405025, "reward_std": 0.1493872031569481, "rewards/ndcg_rule_reward": -0.03063344955444336, "rewards/rule_reward": 0.03515625, "step": 215, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13094877235525917, "grad_norm": 2.560565948486328, "kl": 16.8125, "learning_rate": 9.967632045645384e-06, "loss": 0.0168, "reward": 0.002416811534203589, "reward_std": 0.10824510082602501, "rewards/ndcg_rule_reward": -0.02297381404787302, "rewards/rule_reward": 0.025390625, "step": 216, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1315550166717187, "grad_norm": 1.2639368772506714, "kl": 10.25, "learning_rate": 9.967072187977793e-06, "loss": 0.0103, "reward": 0.002950678113847971, "reward_std": 0.1080230101943016, "rewards/ndcg_rule_reward": -0.02243994642049074, "rewards/rule_reward": 0.025390625, "step": 217, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13216126098817824, "grad_norm": 1.877079963684082, "kl": 12.90625, "learning_rate": 9.966507545895307e-06, "loss": 0.0129, "reward": 0.003289498039521277, "reward_std": 0.10788034647703171, "rewards/ndcg_rule_reward": -0.022101127542555332, "rewards/rule_reward": 0.025390625, "step": 218, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13276750530463777, "grad_norm": 1.2321727275848389, "kl": 9.859375, "learning_rate": 9.965938119941801e-06, "loss": 0.0099, "reward": 0.0026523707201704383, "reward_std": 0.08292706124484539, "rewards/ndcg_rule_reward": -0.016878879396244884, "rewards/rule_reward": 0.01953125, "step": 219, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1333737496210973, "grad_norm": 1.9402889013290405, "kl": 18.234375, "learning_rate": 9.965363910665762e-06, "loss": 0.0183, "reward": 0.003438123967498541, "reward_std": 0.13306547328829765, "rewards/ndcg_rule_reward": -0.027811876498162746, "rewards/rule_reward": 0.03125, "step": 220, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13397999393755683, "grad_norm": 1.1144007444381714, "kl": 4.8828125, "learning_rate": 9.964784918620284e-06, "loss": 0.0049, "reward": 0.0012405419838614762, "reward_std": 0.06676003336906433, "rewards/ndcg_rule_reward": -0.014384457841515541, "rewards/rule_reward": 0.015625, "step": 221, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13458623825401636, "grad_norm": 1.3669874668121338, "kl": 16.6875, "learning_rate": 9.964201144363064e-06, "loss": 0.0167, "reward": 0.0025108184199780226, "reward_std": 0.10821764543652534, "rewards/ndcg_rule_reward": -0.022879806347191334, "rewards/rule_reward": 0.025390625, "step": 222, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1351924825704759, "grad_norm": 6.439158916473389, "kl": 26.125, "learning_rate": 9.963612588456413e-06, "loss": 0.0261, "reward": 0.004306566319428384, "reward_std": 0.14105799421668053, "rewards/ndcg_rule_reward": -0.028896557167172432, "rewards/rule_reward": 0.033203125, "step": 223, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13579872688693542, "grad_norm": 3.3186240196228027, "kl": 35.875, "learning_rate": 9.963019251467241e-06, "loss": 0.0358, "reward": 0.003378933761268854, "reward_std": 0.11621813476085663, "rewards/ndcg_rule_reward": -0.023964816704392433, "rewards/rule_reward": 0.02734375, "step": 224, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13640497120339498, "grad_norm": 1.2260864973068237, "kl": 14.125, "learning_rate": 9.962421133967067e-06, "loss": 0.0141, "reward": 0.003148105926811695, "reward_std": 0.09952403977513313, "rewards/ndcg_rule_reward": -0.020289394073188305, "rewards/rule_reward": 0.0234375, "step": 225, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1370112155198545, "grad_norm": 1.3529484272003174, "kl": 25.625, "learning_rate": 9.961818236532012e-06, "loss": 0.0256, "reward": 0.002375202369876206, "reward_std": 0.09149914607405663, "rewards/ndcg_rule_reward": -0.01910917228087783, "rewards/rule_reward": 0.021484375, "step": 226, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13761745983631404, "grad_norm": 1.7283827066421509, "kl": 26.0, "learning_rate": 9.961210559742805e-06, "loss": 0.026, "reward": 0.0040992326103150845, "reward_std": 0.1411764770746231, "rewards/ndcg_rule_reward": -0.029103892855346203, "rewards/rule_reward": 0.033203125, "step": 227, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13822370415277357, "grad_norm": 1.4354116916656494, "kl": 20.3125, "learning_rate": 9.960598104184775e-06, "loss": 0.0203, "reward": 0.002220508176833391, "reward_std": 0.07469014823436737, "rewards/ndcg_rule_reward": -0.015357617288827896, "rewards/rule_reward": 0.017578125, "step": 228, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1388299484692331, "grad_norm": 1.147745132446289, "kl": 29.0, "learning_rate": 9.959980870447853e-06, "loss": 0.0289, "reward": 0.0018963312031701207, "reward_std": 0.06642504408955574, "rewards/ndcg_rule_reward": -0.013728668913245201, "rewards/rule_reward": 0.015625, "step": 229, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.13943619278569264, "grad_norm": 1.542182445526123, "kl": 30.875, "learning_rate": 9.959358859126582e-06, "loss": 0.0309, "reward": 0.003834324306808412, "reward_std": 0.10761530697345734, "rewards/ndcg_rule_reward": -0.021556301042437553, "rewards/rule_reward": 0.025390625, "step": 230, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.14004243710215217, "grad_norm": 2.035217046737671, "kl": 42.0, "learning_rate": 9.958732070820092e-06, "loss": 0.042, "reward": 0.004005961120128632, "reward_std": 0.12438423186540604, "rewards/ndcg_rule_reward": -0.02529091387987137, "rewards/rule_reward": 0.029296875, "step": 231, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1406486814186117, "grad_norm": 1.3242509365081787, "kl": 17.34375, "learning_rate": 9.958100506132127e-06, "loss": 0.0173, "reward": 0.002904911176301539, "reward_std": 0.09964482113718987, "rewards/ndcg_rule_reward": -0.02053258940577507, "rewards/rule_reward": 0.0234375, "step": 232, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.14125492573507123, "grad_norm": 9.165868759155273, "kl": 85.875, "learning_rate": 9.957464165671022e-06, "loss": 0.0861, "reward": 0.0031822238815948367, "reward_std": 0.09951488301157951, "rewards/ndcg_rule_reward": -0.02025527600198984, "rewards/rule_reward": 0.0234375, "step": 233, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.14186117005153076, "grad_norm": 3.3375096321105957, "kl": 22.625, "learning_rate": 9.956823050049722e-06, "loss": 0.0226, "reward": 0.003818535595200956, "reward_std": 0.11604771763086319, "rewards/ndcg_rule_reward": -0.02352521475404501, "rewards/rule_reward": 0.02734375, "step": 234, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1424674143679903, "grad_norm": 1.6122366189956665, "kl": 11.0625, "learning_rate": 9.956177159885765e-06, "loss": 0.0111, "reward": 0.0012853299267590046, "reward_std": 0.07514073699712753, "rewards/ndcg_rule_reward": -0.016292794607579708, "rewards/rule_reward": 0.017578125, "step": 235, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.14307365868444982, "grad_norm": 2.007629632949829, "kl": 21.1875, "learning_rate": 9.955526495801287e-06, "loss": 0.0212, "reward": 0.0022227864246815443, "reward_std": 0.08312361687421799, "rewards/ndcg_rule_reward": -0.017308464273810387, "rewards/rule_reward": 0.01953125, "step": 236, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.14367990300090935, "grad_norm": 2.600972890853882, "kl": 31.5, "learning_rate": 9.954871058423026e-06, "loss": 0.0315, "reward": 0.004094116156920791, "reward_std": 0.14115600287914276, "rewards/ndcg_rule_reward": -0.02910900954157114, "rewards/rule_reward": 0.033203125, "step": 237, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1442861473173689, "grad_norm": 3.4248836040496826, "kl": 17.84375, "learning_rate": 9.95421084838232e-06, "loss": 0.0179, "reward": 0.004110949346795678, "reward_std": 0.13272163644433022, "rewards/ndcg_rule_reward": -0.027139050886034966, "rewards/rule_reward": 0.03125, "step": 238, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.14489239163382844, "grad_norm": 1.5609183311462402, "kl": 16.0625, "learning_rate": 9.953545866315094e-06, "loss": 0.016, "reward": 0.0038896837504580617, "reward_std": 0.14968076348304749, "rewards/ndcg_rule_reward": -0.031266567297279835, "rewards/rule_reward": 0.03515625, "step": 239, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.14549863595028797, "grad_norm": 0.9374715089797974, "kl": 5.05859375, "learning_rate": 9.952876112861882e-06, "loss": 0.0051, "reward": 0.0018188042449764907, "reward_std": 0.07490970194339752, "rewards/ndcg_rule_reward": -0.015759320929646492, "rewards/rule_reward": 0.017578125, "step": 240, "token_diversity": 0.5390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1461048802667475, "grad_norm": 1.9270473718643188, "kl": 18.125, "learning_rate": 9.952201588667805e-06, "loss": 0.0181, "reward": 0.0036332266172394156, "reward_std": 0.12452812120318413, "rewards/ndcg_rule_reward": -0.025663647800683975, "rewards/rule_reward": 0.029296875, "step": 241, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.14671112458320704, "grad_norm": 1.5544735193252563, "kl": 31.1875, "learning_rate": 9.951522294382585e-06, "loss": 0.0311, "reward": 0.0033780259545892477, "reward_std": 0.116254523396492, "rewards/ndcg_rule_reward": -0.023965724743902683, "rewards/rule_reward": 0.02734375, "step": 242, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.14731736889966657, "grad_norm": 1.1275932788848877, "kl": 18.3125, "learning_rate": 9.950838230660535e-06, "loss": 0.0183, "reward": 0.003812560345977545, "reward_std": 0.11604946851730347, "rewards/ndcg_rule_reward": -0.023531189188361168, "rewards/rule_reward": 0.02734375, "step": 243, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1479236132161261, "grad_norm": 1.4885159730911255, "kl": 31.9375, "learning_rate": 9.950149398160562e-06, "loss": 0.0319, "reward": 0.0032824600348249078, "reward_std": 0.09105430915951729, "rewards/ndcg_rule_reward": -0.0182019155472517, "rewards/rule_reward": 0.021484375, "step": 244, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.14852985753258563, "grad_norm": 1.5881165266036987, "kl": 43.5625, "learning_rate": 9.949455797546168e-06, "loss": 0.0436, "reward": 0.003317782422527671, "reward_std": 0.11626932770013809, "rewards/ndcg_rule_reward": -0.02402596641331911, "rewards/rule_reward": 0.02734375, "step": 245, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 7.880859375, "epoch": 0.14913610184904516, "grad_norm": 1.3747881650924683, "kl": 15.84375, "learning_rate": 9.94875742948545e-06, "loss": 0.0158, "reward": 0.0029149666079320014, "reward_std": 0.0996648408472538, "rewards/ndcg_rule_reward": -0.02052253345027566, "rewards/rule_reward": 0.0234375, "step": 246, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.76953125, "epoch": 0.1497423461655047, "grad_norm": 2.203338623046875, "kl": 44.5, "learning_rate": 9.94805429465109e-06, "loss": 0.0446, "reward": 0.005497462581843138, "reward_std": 0.19940226525068283, "rewards/ndcg_rule_reward": -0.041377536952495575, "rewards/rule_reward": 0.046875, "step": 247, "token_diversity": 0.41484937050359716 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15034859048196422, "grad_norm": 1.1104381084442139, "kl": 17.21875, "learning_rate": 9.947346393720372e-06, "loss": 0.0172, "reward": 0.003195245051756501, "reward_std": 0.10791803523898125, "rewards/ndcg_rule_reward": -0.022195380181074142, "rewards/rule_reward": 0.025390625, "step": 248, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 6.587890625, "epoch": 0.15095483479842375, "grad_norm": 1.7636610269546509, "kl": 31.84375, "learning_rate": 9.946633727375158e-06, "loss": 0.0318, "reward": 0.002727240789681673, "reward_std": 0.09968985989689827, "rewards/ndcg_rule_reward": -0.020710259675979614, "rewards/rule_reward": 0.0234375, "step": 249, "token_diversity": 0.27393353174603174 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15156107911488328, "grad_norm": 1.8702973127365112, "kl": 43.875, "learning_rate": 9.945916296301914e-06, "loss": 0.0439, "reward": 0.0035013570450246334, "reward_std": 0.11617690324783325, "rewards/ndcg_rule_reward": -0.023842393420636654, "rewards/rule_reward": 0.02734375, "step": 250, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15216732343134284, "grad_norm": 1.614966869354248, "kl": 34.8125, "learning_rate": 9.945194101191682e-06, "loss": 0.0348, "reward": 0.00318886898458004, "reward_std": 0.10791410133242607, "rewards/ndcg_rule_reward": -0.022201756946742535, "rewards/rule_reward": 0.025390625, "step": 251, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15277356774780237, "grad_norm": 7.732960224151611, "kl": 44.5, "learning_rate": 9.944467142740104e-06, "loss": 0.0445, "reward": 0.003103943890891969, "reward_std": 0.10797019302845001, "rewards/ndcg_rule_reward": -0.022286681458353996, "rewards/rule_reward": 0.025390625, "step": 252, "token_diversity": 0.53125 }, { "categorical_diversity": 1.0, "completion_length": 5.2578125, "epoch": 0.1533798120642619, "grad_norm": 1.8057838678359985, "kl": 30.65625, "learning_rate": 9.943735421647404e-06, "loss": 0.0306, "reward": 0.0019194140331819654, "reward_std": 0.06641627475619316, "rewards/ndcg_rule_reward": -0.013705586548894644, "rewards/rule_reward": 0.015625, "step": 253, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15398605638072144, "grad_norm": 1.3822717666625977, "kl": 12.46875, "learning_rate": 9.942998938618394e-06, "loss": 0.0124, "reward": 0.0032484575640410185, "reward_std": 0.1078592874109745, "rewards/ndcg_rule_reward": -0.022142167203128338, "rewards/rule_reward": 0.025390625, "step": 254, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15459230069718097, "grad_norm": 1.8244844675064087, "kl": 35.625, "learning_rate": 9.942257694362476e-06, "loss": 0.0356, "reward": 0.002973754075355828, "reward_std": 0.10800963640213013, "rewards/ndcg_rule_reward": -0.02241687197238207, "rewards/rule_reward": 0.025390625, "step": 255, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 6.03125, "epoch": 0.1551985450136405, "grad_norm": 2.64068865776062, "kl": 34.625, "learning_rate": 9.941511689593634e-06, "loss": 0.0347, "reward": 0.0033434974029660225, "reward_std": 0.10783670097589493, "rewards/ndcg_rule_reward": -0.022047127597033978, "rewards/rule_reward": 0.025390625, "step": 256, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15580478933010003, "grad_norm": 2.2459280490875244, "kl": 31.9375, "learning_rate": 9.94076092503044e-06, "loss": 0.0319, "reward": 0.004318315302953124, "reward_std": 0.166309654712677, "rewards/ndcg_rule_reward": -0.03474418446421623, "rewards/rule_reward": 0.0390625, "step": 257, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15641103364655956, "grad_norm": 1.2507753372192383, "kl": 27.4375, "learning_rate": 9.94000540139605e-06, "loss": 0.0275, "reward": 0.0029103331035003066, "reward_std": 0.0996362678706646, "rewards/ndcg_rule_reward": -0.02052716724574566, "rewards/rule_reward": 0.0234375, "step": 258, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.318359375, "epoch": 0.1570172779630191, "grad_norm": 1.2056535482406616, "kl": 40.0625, "learning_rate": 9.939245119418208e-06, "loss": 0.0401, "reward": 0.003831595415249467, "reward_std": 0.11601485311985016, "rewards/ndcg_rule_reward": -0.02351215574890375, "rewards/rule_reward": 0.02734375, "step": 259, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15762352227947862, "grad_norm": 1.8153704404830933, "kl": 21.25, "learning_rate": 9.938480079829232e-06, "loss": 0.0212, "reward": 0.002977226162329316, "reward_std": 0.11643825843930244, "rewards/ndcg_rule_reward": -0.024366524070501328, "rewards/rule_reward": 0.02734375, "step": 260, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15822976659593815, "grad_norm": 2.033562660217285, "kl": 23.03125, "learning_rate": 9.937710283366032e-06, "loss": 0.0231, "reward": 0.0031108789844438434, "reward_std": 0.10793989524245262, "rewards/ndcg_rule_reward": -0.022279745899140835, "rewards/rule_reward": 0.025390625, "step": 261, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15883601091239768, "grad_norm": 1.719241738319397, "kl": 15.78125, "learning_rate": 9.936935730770093e-06, "loss": 0.0158, "reward": 0.003125484101474285, "reward_std": 0.10796520859003067, "rewards/ndcg_rule_reward": -0.022265140898525715, "rewards/rule_reward": 0.025390625, "step": 262, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.15944225522885722, "grad_norm": 2.241523027420044, "kl": 16.765625, "learning_rate": 9.936156422787488e-06, "loss": 0.0168, "reward": 0.003109463141299784, "reward_std": 0.0995398759841919, "rewards/ndcg_rule_reward": -0.020328037440776825, "rewards/rule_reward": 0.0234375, "step": 263, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16004849954531677, "grad_norm": 1.5047575235366821, "kl": 44.8125, "learning_rate": 9.935372360168865e-06, "loss": 0.0448, "reward": 0.0030982820317149162, "reward_std": 0.1079329252243042, "rewards/ndcg_rule_reward": -0.02229234389960766, "rewards/rule_reward": 0.025390625, "step": 264, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1606547438617763, "grad_norm": 1.3887983560562134, "kl": 16.6875, "learning_rate": 9.934583543669454e-06, "loss": 0.0167, "reward": 0.00253418181091547, "reward_std": 0.09138576686382294, "rewards/ndcg_rule_reward": -0.018950194120407104, "rewards/rule_reward": 0.021484375, "step": 265, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16126098817823584, "grad_norm": 2.0822856426239014, "kl": 19.15625, "learning_rate": 9.933789974049064e-06, "loss": 0.0191, "reward": 0.003652912797406316, "reward_std": 0.11613679677248001, "rewards/ndcg_rule_reward": -0.023690836504101753, "rewards/rule_reward": 0.02734375, "step": 266, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16186723249469537, "grad_norm": 1.530821442604065, "kl": 41.5625, "learning_rate": 9.932991652072084e-06, "loss": 0.0415, "reward": 0.002516221080441028, "reward_std": 0.08299155905842781, "rewards/ndcg_rule_reward": -0.017015029676258564, "rewards/rule_reward": 0.01953125, "step": 267, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1624734768111549, "grad_norm": 1.6347402334213257, "kl": 33.3125, "learning_rate": 9.932188578507475e-06, "loss": 0.0333, "reward": 0.003996520768851042, "reward_std": 0.14122692868113518, "rewards/ndcg_rule_reward": -0.02920660562813282, "rewards/rule_reward": 0.033203125, "step": 268, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16307972112761443, "grad_norm": 2.8814172744750977, "kl": 25.6875, "learning_rate": 9.931380754128783e-06, "loss": 0.0256, "reward": 0.003182480693794787, "reward_std": 0.12473859637975693, "rewards/ndcg_rule_reward": -0.026114394888281822, "rewards/rule_reward": 0.029296875, "step": 269, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16368596544407396, "grad_norm": 2.3460681438446045, "kl": 40.375, "learning_rate": 9.930568179714122e-06, "loss": 0.0403, "reward": 0.0032009717542678118, "reward_std": 0.10790961980819702, "rewards/ndcg_rule_reward": -0.022189653478562832, "rewards/rule_reward": 0.025390625, "step": 270, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1642922097605335, "grad_norm": 1.8400081396102905, "kl": 37.3125, "learning_rate": 9.929750856046187e-06, "loss": 0.0373, "reward": 0.0027181506156921387, "reward_std": 0.1081540659070015, "rewards/ndcg_rule_reward": -0.02267247438430786, "rewards/rule_reward": 0.025390625, "step": 271, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16489845407699302, "grad_norm": 1.2690964937210083, "kl": 23.0625, "learning_rate": 9.928928783912246e-06, "loss": 0.0231, "reward": 0.0021935676923021674, "reward_std": 0.0831325463950634, "rewards/ndcg_rule_reward": -0.017337682656943798, "rewards/rule_reward": 0.01953125, "step": 272, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16550469839345255, "grad_norm": 1.1781011819839478, "kl": 17.59375, "learning_rate": 9.928101964104137e-06, "loss": 0.0176, "reward": 0.0016218723030760884, "reward_std": 0.0665770135819912, "rewards/ndcg_rule_reward": -0.014003127813339233, "rewards/rule_reward": 0.015625, "step": 273, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16611094270991208, "grad_norm": 1.5034773349761963, "kl": 28.8125, "learning_rate": 9.927270397418281e-06, "loss": 0.0288, "reward": 0.004124015220440924, "reward_std": 0.1159178838133812, "rewards/ndcg_rule_reward": -0.023219735361635685, "rewards/rule_reward": 0.02734375, "step": 274, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16671718702637162, "grad_norm": 17.42371368408203, "kl": 60.375, "learning_rate": 9.926434084655658e-06, "loss": 0.0604, "reward": 0.002548347576521337, "reward_std": 0.08295351266860962, "rewards/ndcg_rule_reward": -0.01698290277272463, "rewards/rule_reward": 0.01953125, "step": 275, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16732343134283117, "grad_norm": 1.8386266231536865, "kl": 32.375, "learning_rate": 9.925593026621833e-06, "loss": 0.0323, "reward": 0.004302919143810868, "reward_std": 0.1326543688774109, "rewards/ndcg_rule_reward": -0.026947081089019775, "rewards/rule_reward": 0.03125, "step": 276, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1679296756592907, "grad_norm": 1.2298511266708374, "kl": 21.8125, "learning_rate": 9.924747224126932e-06, "loss": 0.0218, "reward": 0.002975119394250214, "reward_std": 0.08275575190782547, "rewards/ndcg_rule_reward": -0.016556130722165108, "rewards/rule_reward": 0.01953125, "step": 277, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16853591997575024, "grad_norm": 1.7239320278167725, "kl": 27.625, "learning_rate": 9.923896677985653e-06, "loss": 0.0277, "reward": 0.004189550876617432, "reward_std": 0.14109517633914948, "rewards/ndcg_rule_reward": -0.029013575986027718, "rewards/rule_reward": 0.033203125, "step": 278, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.16914216429220977, "grad_norm": 2.0664851665496826, "kl": 26.0625, "learning_rate": 9.923041389017266e-06, "loss": 0.0261, "reward": 0.0031865128548815846, "reward_std": 0.12474089860916138, "rewards/ndcg_rule_reward": -0.026110361330211163, "rewards/rule_reward": 0.029296875, "step": 279, "token_diversity": 0.3828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1697484086086693, "grad_norm": 3.3158698081970215, "kl": 34.5625, "learning_rate": 9.922181358045608e-06, "loss": 0.0345, "reward": 0.0038807825185358524, "reward_std": 0.14128848537802696, "rewards/ndcg_rule_reward": -0.029322342947125435, "rewards/rule_reward": 0.033203125, "step": 280, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17035465292512883, "grad_norm": 1.3390153646469116, "kl": 21.1875, "learning_rate": 9.921316585899083e-06, "loss": 0.0212, "reward": 0.002958644472528249, "reward_std": 0.09961631894111633, "rewards/ndcg_rule_reward": -0.020478855818510056, "rewards/rule_reward": 0.0234375, "step": 281, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17096089724158836, "grad_norm": 1.6369826793670654, "kl": 31.75, "learning_rate": 9.920447073410664e-06, "loss": 0.0318, "reward": 0.0032788285170681775, "reward_std": 0.12467684969305992, "rewards/ndcg_rule_reward": -0.0260180477052927, "rewards/rule_reward": 0.029296875, "step": 282, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1715671415580479, "grad_norm": 89.83685302734375, "kl": 264.39453125, "learning_rate": 9.919572821417887e-06, "loss": 0.2653, "reward": 0.0028655982459895313, "reward_std": 0.10804547369480133, "rewards/ndcg_rule_reward": -0.02252502553164959, "rewards/rule_reward": 0.025390625, "step": 283, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17217338587450742, "grad_norm": 1.6804890632629395, "kl": 22.25, "learning_rate": 9.918693830762853e-06, "loss": 0.0222, "reward": 0.00388420931994915, "reward_std": 0.11600178852677345, "rewards/ndcg_rule_reward": -0.02345954068005085, "rewards/rule_reward": 0.02734375, "step": 284, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17277963019096695, "grad_norm": 2.0195422172546387, "kl": 50.4375, "learning_rate": 9.917810102292233e-06, "loss": 0.0505, "reward": 0.003206973779015243, "reward_std": 0.0995098240673542, "rewards/ndcg_rule_reward": -0.020230526104569435, "rewards/rule_reward": 0.0234375, "step": 285, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 7.40234375, "epoch": 0.17338587450742649, "grad_norm": 2.7074739933013916, "kl": 47.625, "learning_rate": 9.916921636857253e-06, "loss": 0.0477, "reward": 0.004434977425262332, "reward_std": 0.1578158661723137, "rewards/ndcg_rule_reward": -0.03267439641058445, "rewards/rule_reward": 0.037109375, "step": 286, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17399211882388602, "grad_norm": 1.9060746431350708, "kl": 27.3125, "learning_rate": 9.91602843531371e-06, "loss": 0.0274, "reward": 0.00303670612629503, "reward_std": 0.10799491032958031, "rewards/ndcg_rule_reward": -0.022353919222950935, "rewards/rule_reward": 0.025390625, "step": 287, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17459836314034555, "grad_norm": 3.310220718383789, "kl": 33.8125, "learning_rate": 9.915130498521956e-06, "loss": 0.0339, "reward": 0.0035798688186332583, "reward_std": 0.12455740571022034, "rewards/ndcg_rule_reward": -0.025717006996273994, "rewards/rule_reward": 0.029296875, "step": 288, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1752046074568051, "grad_norm": 2.209610939025879, "kl": 27.6875, "learning_rate": 9.91422782734691e-06, "loss": 0.0277, "reward": 0.003612605854868889, "reward_std": 0.1245780773460865, "rewards/ndcg_rule_reward": -0.02568426914513111, "rewards/rule_reward": 0.029296875, "step": 289, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17581085177326464, "grad_norm": 1.0949413776397705, "kl": 16.03125, "learning_rate": 9.913320422658048e-06, "loss": 0.016, "reward": 0.002263823407702148, "reward_std": 0.08314058929681778, "rewards/ndcg_rule_reward": -0.017267427407205105, "rewards/rule_reward": 0.01953125, "step": 290, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17641709608972417, "grad_norm": 1.4855234622955322, "kl": 38.6875, "learning_rate": 9.912408285329406e-06, "loss": 0.0387, "reward": 0.004022290464490652, "reward_std": 0.12434931099414825, "rewards/ndcg_rule_reward": -0.02527458406984806, "rewards/rule_reward": 0.029296875, "step": 291, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1770233404061837, "grad_norm": 1.907395839691162, "kl": 29.375, "learning_rate": 9.911491416239578e-06, "loss": 0.0293, "reward": 0.004266746807843447, "reward_std": 0.15792647004127502, "rewards/ndcg_rule_reward": -0.032842627726495266, "rewards/rule_reward": 0.037109375, "step": 292, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17762958472264323, "grad_norm": 2.719893455505371, "kl": 30.4375, "learning_rate": 9.910569816271717e-06, "loss": 0.0304, "reward": 0.0025694789364933968, "reward_std": 0.08294529467821121, "rewards/ndcg_rule_reward": -0.016961771994829178, "rewards/rule_reward": 0.01953125, "step": 293, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17823582903910276, "grad_norm": 2.6087028980255127, "kl": 29.40625, "learning_rate": 9.909643486313533e-06, "loss": 0.0294, "reward": 0.002883654786273837, "reward_std": 0.10806329548358917, "rewards/ndcg_rule_reward": -0.02250696998089552, "rewards/rule_reward": 0.025390625, "step": 294, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0625, "epoch": 0.1788420733555623, "grad_norm": 4.233660697937012, "kl": 13.65625, "learning_rate": 9.908712427257291e-06, "loss": 0.0137, "reward": 0.0019192193867638707, "reward_std": 0.04959258623421192, "rewards/ndcg_rule_reward": -0.009799530962482095, "rewards/rule_reward": 0.01171875, "step": 295, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.17944831767202182, "grad_norm": 1.8169139623641968, "kl": 21.875, "learning_rate": 9.907776639999814e-06, "loss": 0.0219, "reward": 0.003119796165265143, "reward_std": 0.09953759983181953, "rewards/ndcg_rule_reward": -0.020317704416811466, "rewards/rule_reward": 0.0234375, "step": 296, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18005456198848135, "grad_norm": 1.674465537071228, "kl": 23.875, "learning_rate": 9.906836125442472e-06, "loss": 0.0239, "reward": 0.002547107054851949, "reward_std": 0.09138219803571701, "rewards/ndcg_rule_reward": -0.01893726736307144, "rewards/rule_reward": 0.021484375, "step": 297, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 6.95703125, "epoch": 0.18066080630494089, "grad_norm": 3.319779634475708, "kl": 40.1875, "learning_rate": 9.905890884491196e-06, "loss": 0.0403, "reward": 0.0038278813008219004, "reward_std": 0.12444334477186203, "rewards/ndcg_rule_reward": -0.025468993932008743, "rewards/rule_reward": 0.029296875, "step": 298, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18126705062140042, "grad_norm": 1.2466994524002075, "kl": 31.0, "learning_rate": 9.904940918056467e-06, "loss": 0.0309, "reward": 0.0029147155582904816, "reward_std": 0.09963441640138626, "rewards/ndcg_rule_reward": -0.020522785373032093, "rewards/rule_reward": 0.0234375, "step": 299, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18187329493785995, "grad_norm": 2.0794506072998047, "kl": 37.8125, "learning_rate": 9.90398622705332e-06, "loss": 0.0378, "reward": 0.003148204181343317, "reward_std": 0.10792889073491096, "rewards/ndcg_rule_reward": -0.02224242128431797, "rewards/rule_reward": 0.025390625, "step": 300, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18247953925431948, "grad_norm": 2.692159652709961, "kl": 34.0, "learning_rate": 9.903026812401334e-06, "loss": 0.034, "reward": 0.003597068600356579, "reward_std": 0.14982564747333527, "rewards/ndcg_rule_reward": -0.03155918139964342, "rewards/rule_reward": 0.03515625, "step": 301, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18308578357077904, "grad_norm": 2.1052002906799316, "kl": 15.375, "learning_rate": 9.902062675024644e-06, "loss": 0.0154, "reward": 0.003165093599818647, "reward_std": 0.099526297301054, "rewards/ndcg_rule_reward": -0.020272407680749893, "rewards/rule_reward": 0.0234375, "step": 302, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 8.16015625, "epoch": 0.18369202788723857, "grad_norm": 1.5732333660125732, "kl": 41.0, "learning_rate": 9.901093815851935e-06, "loss": 0.0409, "reward": 0.004870378412306309, "reward_std": 0.14924628287553787, "rewards/ndcg_rule_reward": -0.03028587345033884, "rewards/rule_reward": 0.03515625, "step": 303, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1842982722036981, "grad_norm": 5.2313385009765625, "kl": 39.125, "learning_rate": 9.900120235816436e-06, "loss": 0.0391, "reward": 0.0035801518242806196, "reward_std": 0.12455111742019653, "rewards/ndcg_rule_reward": -0.025716722942888737, "rewards/rule_reward": 0.029296875, "step": 304, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18490451652015763, "grad_norm": 2.3328239917755127, "kl": 27.125, "learning_rate": 9.899141935855923e-06, "loss": 0.0271, "reward": 0.0036226300289854407, "reward_std": 0.1330011896789074, "rewards/ndcg_rule_reward": -0.02762736938893795, "rewards/rule_reward": 0.03125, "step": 305, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18551076083661716, "grad_norm": 1.6894643306732178, "kl": 27.125, "learning_rate": 9.898158916912723e-06, "loss": 0.0271, "reward": 0.0034482665359973907, "reward_std": 0.11622898653149605, "rewards/ndcg_rule_reward": -0.02389548346400261, "rewards/rule_reward": 0.02734375, "step": 306, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1861170051530767, "grad_norm": 3.7685980796813965, "kl": 45.375, "learning_rate": 9.897171179933706e-06, "loss": 0.0453, "reward": 0.00369845237582922, "reward_std": 0.12448274716734886, "rewards/ndcg_rule_reward": -0.025598421692848206, "rewards/rule_reward": 0.029296875, "step": 307, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18672324946953622, "grad_norm": 1.3227609395980835, "kl": 18.6875, "learning_rate": 9.896178725870287e-06, "loss": 0.0187, "reward": 0.002299778105225414, "reward_std": 0.09992126747965813, "rewards/ndcg_rule_reward": -0.02113772090524435, "rewards/rule_reward": 0.0234375, "step": 308, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18732949378599575, "grad_norm": 1.7337253093719482, "kl": 18.015625, "learning_rate": 9.895181555678419e-06, "loss": 0.018, "reward": 0.003372808452695608, "reward_std": 0.13308529183268547, "rewards/ndcg_rule_reward": -0.02787719201296568, "rewards/rule_reward": 0.03125, "step": 309, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18793573810245529, "grad_norm": 1.3471633195877075, "kl": 13.375, "learning_rate": 9.894179670318607e-06, "loss": 0.0134, "reward": 0.0030915997340343893, "reward_std": 0.10797886922955513, "rewards/ndcg_rule_reward": -0.022299026604741812, "rewards/rule_reward": 0.025390625, "step": 310, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.02734375, "epoch": 0.18854198241891482, "grad_norm": 1.2766430377960205, "kl": 16.03125, "learning_rate": 9.893173070755893e-06, "loss": 0.0161, "reward": 0.003158187959343195, "reward_std": 0.10792429000139236, "rewards/ndcg_rule_reward": -0.022232437506318092, "rewards/rule_reward": 0.025390625, "step": 311, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18914822673537435, "grad_norm": 1.713754653930664, "kl": 31.0, "learning_rate": 9.89216175795986e-06, "loss": 0.031, "reward": 0.004461031989194453, "reward_std": 0.12414921075105667, "rewards/ndcg_rule_reward": -0.024835842661559582, "rewards/rule_reward": 0.029296875, "step": 312, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.18975447105183388, "grad_norm": 2.871004104614258, "kl": 51.25, "learning_rate": 9.891145732904628e-06, "loss": 0.0512, "reward": 0.0039046132005751133, "reward_std": 0.1244230642914772, "rewards/ndcg_rule_reward": -0.025392262265086174, "rewards/rule_reward": 0.029296875, "step": 313, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1903607153682934, "grad_norm": 1.51375412940979, "kl": 22.46875, "learning_rate": 9.890124996568863e-06, "loss": 0.0224, "reward": 0.002862304332666099, "reward_std": 0.09964866563677788, "rewards/ndcg_rule_reward": -0.020575196482241154, "rewards/rule_reward": 0.0234375, "step": 314, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.19096695968475297, "grad_norm": 2.0482730865478516, "kl": 40.25, "learning_rate": 9.889099549935765e-06, "loss": 0.0402, "reward": 0.003993600374087691, "reward_std": 0.13275455683469772, "rewards/ndcg_rule_reward": -0.027256399393081665, "rewards/rule_reward": 0.03125, "step": 315, "token_diversity": 0.53125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1915732040012125, "grad_norm": 3.6309030055999756, "kl": 60.5, "learning_rate": 9.888069393993069e-06, "loss": 0.0605, "reward": 0.002268837997689843, "reward_std": 0.08306685090065002, "rewards/ndcg_rule_reward": -0.017262411303818226, "rewards/rule_reward": 0.01953125, "step": 316, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.19217944831767203, "grad_norm": 4.0640668869018555, "kl": 57.125, "learning_rate": 9.88703452973305e-06, "loss": 0.0573, "reward": 0.0039871841436252, "reward_std": 0.13279035314917564, "rewards/ndcg_rule_reward": -0.02726281713694334, "rewards/rule_reward": 0.03125, "step": 317, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.15234375, "epoch": 0.19278569263413156, "grad_norm": 1.6576069593429565, "kl": 41.375, "learning_rate": 9.885994958152518e-06, "loss": 0.0413, "reward": 0.003057503665331751, "reward_std": 0.0995847787708044, "rewards/ndcg_rule_reward": -0.020379996858537197, "rewards/rule_reward": 0.0234375, "step": 318, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1933919369505911, "grad_norm": 1.7887616157531738, "kl": 27.25, "learning_rate": 9.884950680252811e-06, "loss": 0.0273, "reward": 0.0033844661666080356, "reward_std": 0.1162518858909607, "rewards/ndcg_rule_reward": -0.023959284648299217, "rewards/rule_reward": 0.02734375, "step": 319, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.19399818126705062, "grad_norm": 1.3486026525497437, "kl": 29.4375, "learning_rate": 9.883901697039809e-06, "loss": 0.0294, "reward": 0.0033664737129583955, "reward_std": 0.09942357987165451, "rewards/ndcg_rule_reward": -0.020071026869118214, "rewards/rule_reward": 0.0234375, "step": 320, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.19460442558351015, "grad_norm": 1.8340590000152588, "kl": 33.5, "learning_rate": 9.882848009523919e-06, "loss": 0.0336, "reward": 0.0021081692539155483, "reward_std": 0.07475614920258522, "rewards/ndcg_rule_reward": -0.015469956211745739, "rewards/rule_reward": 0.017578125, "step": 321, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.19521066989996969, "grad_norm": 1.7310140132904053, "kl": 18.75, "learning_rate": 9.88178961872008e-06, "loss": 0.0187, "reward": 0.0020909522427245975, "reward_std": 0.07476552948355675, "rewards/ndcg_rule_reward": -0.015487172175198793, "rewards/rule_reward": 0.017578125, "step": 322, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.19581691421642922, "grad_norm": 1.1578727960586548, "kl": 10.435546875, "learning_rate": 9.880726525647764e-06, "loss": 0.0104, "reward": 0.0022225314751267433, "reward_std": 0.09994528070092201, "rewards/ndcg_rule_reward": -0.02121496805921197, "rewards/rule_reward": 0.0234375, "step": 323, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.19642315853288875, "grad_norm": 1.3791279792785645, "kl": 29.875, "learning_rate": 9.879658731330969e-06, "loss": 0.0299, "reward": 0.003210873808711767, "reward_std": 0.10793584957718849, "rewards/ndcg_rule_reward": -0.02217975165694952, "rewards/rule_reward": 0.025390625, "step": 324, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.19702940284934828, "grad_norm": 1.983180284500122, "kl": 25.46875, "learning_rate": 9.878586236798222e-06, "loss": 0.0254, "reward": 0.0023342272033914924, "reward_std": 0.09990864247083664, "rewards/ndcg_rule_reward": -0.021103273145854473, "rewards/rule_reward": 0.0234375, "step": 325, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1976356471658078, "grad_norm": 1.4968643188476562, "kl": 34.5625, "learning_rate": 9.877509043082579e-06, "loss": 0.0346, "reward": 0.0043333584908396006, "reward_std": 0.13261690735816956, "rewards/ndcg_rule_reward": -0.026916641741991043, "rewards/rule_reward": 0.03125, "step": 326, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.19824189148226734, "grad_norm": 5.088555335998535, "kl": 53.625, "learning_rate": 9.876427151221618e-06, "loss": 0.0535, "reward": 0.004827054915949702, "reward_std": 0.15768075734376907, "rewards/ndcg_rule_reward": -0.03228231891989708, "rewards/rule_reward": 0.037109375, "step": 327, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.1988481357987269, "grad_norm": 1.575700044631958, "kl": 19.875, "learning_rate": 9.875340562257454e-06, "loss": 0.0199, "reward": 0.0025000988971441984, "reward_std": 0.09140089526772499, "rewards/ndcg_rule_reward": -0.018984276801347733, "rewards/rule_reward": 0.021484375, "step": 328, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.19945438011518643, "grad_norm": 1.1372469663619995, "kl": 37.5, "learning_rate": 9.87424927723671e-06, "loss": 0.0375, "reward": 0.0035386853851377964, "reward_std": 0.10777036100625992, "rewards/ndcg_rule_reward": -0.02185194008052349, "rewards/rule_reward": 0.025390625, "step": 329, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20006062443164596, "grad_norm": 1.7733272314071655, "kl": 14.3515625, "learning_rate": 9.873153297210543e-06, "loss": 0.0144, "reward": 0.0029402487562038004, "reward_std": 0.12484219297766685, "rewards/ndcg_rule_reward": -0.02635662630200386, "rewards/rule_reward": 0.029296875, "step": 330, "token_diversity": 0.4609375 }, { "epoch": 0.20006062443164596, "eval_categorical_diversity": 1.0, "eval_completion_length": 5.0, "eval_kl": 11.454445422535212, "eval_loss": 0.011483034119009972, "eval_reward": 0.0012652353895172266, "eval_reward_std": 0.04723879453581824, "eval_rewards/ndcg_rule_reward": -0.0098345665536611, "eval_rewards/rule_reward": 0.011099801936619719, "eval_runtime": 85.1744, "eval_samples_per_second": 53.208, "eval_steps_per_second": 0.059, "eval_token_diversity": 0.34419014084507044, "step": 330 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.2006668687481055, "grad_norm": 2.42783784866333, "kl": 39.25, "learning_rate": 9.872052623234632e-06, "loss": 0.0392, "reward": 0.002991343499161303, "reward_std": 0.12482668459415436, "rewards/ndcg_rule_reward": -0.026305532082915306, "rewards/rule_reward": 0.029296875, "step": 331, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20127311306456502, "grad_norm": 23.2785701751709, "kl": 37.125, "learning_rate": 9.870947256369173e-06, "loss": 0.0372, "reward": 0.0042632941622287035, "reward_std": 0.14106997102499008, "rewards/ndcg_rule_reward": -0.028939830139279366, "rewards/rule_reward": 0.033203125, "step": 332, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20187935738102455, "grad_norm": 1.751656174659729, "kl": 18.375, "learning_rate": 9.869837197678882e-06, "loss": 0.0184, "reward": 0.003498121164739132, "reward_std": 0.13303977996110916, "rewards/ndcg_rule_reward": -0.027751878835260868, "rewards/rule_reward": 0.03125, "step": 333, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20248560169748409, "grad_norm": 1.3185906410217285, "kl": 25.125, "learning_rate": 9.868722448233003e-06, "loss": 0.0251, "reward": 0.0036242802161723375, "reward_std": 0.11614286154508591, "rewards/ndcg_rule_reward": -0.02371947094798088, "rewards/rule_reward": 0.02734375, "step": 334, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20309184601394362, "grad_norm": 1.8272603750228882, "kl": 36.6875, "learning_rate": 9.867603009105287e-06, "loss": 0.0367, "reward": 0.0037952057318761945, "reward_std": 0.13290955126285553, "rewards/ndcg_rule_reward": -0.027454794384539127, "rewards/rule_reward": 0.03125, "step": 335, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20369809033040315, "grad_norm": 1.342319130897522, "kl": 26.28125, "learning_rate": 9.86647888137401e-06, "loss": 0.0263, "reward": 0.0029766929801553488, "reward_std": 0.0996134877204895, "rewards/ndcg_rule_reward": -0.020460806787014008, "rewards/rule_reward": 0.0234375, "step": 336, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20430433464686268, "grad_norm": 1.5254642963409424, "kl": 24.375, "learning_rate": 9.865350066121962e-06, "loss": 0.0244, "reward": 0.003521128441207111, "reward_std": 0.116209477186203, "rewards/ndcg_rule_reward": -0.023822622373700142, "rewards/rule_reward": 0.02734375, "step": 337, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2049105789633222, "grad_norm": 2.012626886367798, "kl": 30.9375, "learning_rate": 9.864216564436444e-06, "loss": 0.0309, "reward": 0.003525597508996725, "reward_std": 0.1414339318871498, "rewards/ndcg_rule_reward": -0.029677527025341988, "rewards/rule_reward": 0.033203125, "step": 338, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20551682327978174, "grad_norm": 1.6922494173049927, "kl": 20.375, "learning_rate": 9.863078377409277e-06, "loss": 0.0204, "reward": 0.0021078716963529587, "reward_std": 0.0831536315381527, "rewards/ndcg_rule_reward": -0.01742337876930833, "rewards/rule_reward": 0.01953125, "step": 339, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20612306759624127, "grad_norm": 1.1455211639404297, "kl": 28.25, "learning_rate": 9.861935506136794e-06, "loss": 0.0283, "reward": 0.0031363690504804254, "reward_std": 0.09949565678834915, "rewards/ndcg_rule_reward": -0.02030113060027361, "rewards/rule_reward": 0.0234375, "step": 340, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20672931191270083, "grad_norm": 47.399452209472656, "kl": 30.3125, "learning_rate": 9.860787951719836e-06, "loss": 0.0304, "reward": 0.0026145321899093688, "reward_std": 0.08293753862380981, "rewards/ndcg_rule_reward": -0.016916717402637005, "rewards/rule_reward": 0.01953125, "step": 341, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20733555622916036, "grad_norm": 3.288428783416748, "kl": 41.75, "learning_rate": 9.85963571526376e-06, "loss": 0.0417, "reward": 0.0031727736350148916, "reward_std": 0.09951802343130112, "rewards/ndcg_rule_reward": -0.020264726597815752, "rewards/rule_reward": 0.0234375, "step": 342, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2079418005456199, "grad_norm": 2.214385986328125, "kl": 33.375, "learning_rate": 9.85847879787843e-06, "loss": 0.0334, "reward": 0.0036415562499314547, "reward_std": 0.1245357096195221, "rewards/ndcg_rule_reward": -0.025655318051576614, "rewards/rule_reward": 0.029296875, "step": 343, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20854804486207942, "grad_norm": 1.7042940855026245, "kl": 27.1875, "learning_rate": 9.857317200678219e-06, "loss": 0.0271, "reward": 0.0036496473476290703, "reward_std": 0.13296176493167877, "rewards/ndcg_rule_reward": -0.027600351721048355, "rewards/rule_reward": 0.03125, "step": 344, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20915428917853895, "grad_norm": 4.177464962005615, "kl": 34.3125, "learning_rate": 9.856150924782007e-06, "loss": 0.0343, "reward": 0.002686513355001807, "reward_std": 0.09130682796239853, "rewards/ndcg_rule_reward": -0.018797862343490124, "rewards/rule_reward": 0.021484375, "step": 345, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.20976053349499849, "grad_norm": 3.1711368560791016, "kl": 49.625, "learning_rate": 9.854979971313183e-06, "loss": 0.0497, "reward": 0.004218368325382471, "reward_std": 0.14108655974268913, "rewards/ndcg_rule_reward": -0.028984755277633667, "rewards/rule_reward": 0.033203125, "step": 346, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21036677781145802, "grad_norm": 1.5181578397750854, "kl": 34.375, "learning_rate": 9.85380434139964e-06, "loss": 0.0343, "reward": 0.002457446535117924, "reward_std": 0.08301140740513802, "rewards/ndcg_rule_reward": -0.017073804046958685, "rewards/rule_reward": 0.01953125, "step": 347, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21097302212791755, "grad_norm": 3.0012545585632324, "kl": 28.8125, "learning_rate": 9.852624036173775e-06, "loss": 0.0289, "reward": 0.0028319042176008224, "reward_std": 0.10805320367217064, "rewards/ndcg_rule_reward": -0.022558719851076603, "rewards/rule_reward": 0.025390625, "step": 348, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21157926644437708, "grad_norm": 1.956161618232727, "kl": 21.6875, "learning_rate": 9.85143905677249e-06, "loss": 0.0217, "reward": 0.0028301303973421454, "reward_std": 0.10805527120828629, "rewards/ndcg_rule_reward": -0.02256049495190382, "rewards/rule_reward": 0.025390625, "step": 349, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2121855107608366, "grad_norm": 1.6617988348007202, "kl": 18.0, "learning_rate": 9.85024940433719e-06, "loss": 0.018, "reward": 0.0037702692206948996, "reward_std": 0.10763894021511078, "rewards/ndcg_rule_reward": -0.021620355546474457, "rewards/rule_reward": 0.025390625, "step": 350, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21279175507729614, "grad_norm": 1.0543262958526611, "kl": 29.25, "learning_rate": 9.849055080013775e-06, "loss": 0.0293, "reward": 0.002112283604219556, "reward_std": 0.06632601097226143, "rewards/ndcg_rule_reward": -0.013512716628611088, "rewards/rule_reward": 0.015625, "step": 351, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21339799939375567, "grad_norm": 1.7185778617858887, "kl": 17.15625, "learning_rate": 9.847856084952653e-06, "loss": 0.0172, "reward": 0.0015815813676454127, "reward_std": 0.07498308643698692, "rewards/ndcg_rule_reward": -0.01599654322490096, "rewards/rule_reward": 0.017578125, "step": 352, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2140042437102152, "grad_norm": 2.344869375228882, "kl": 29.5625, "learning_rate": 9.846652420308728e-06, "loss": 0.0296, "reward": 0.002709215274080634, "reward_std": 0.10812956467270851, "rewards/ndcg_rule_reward": -0.022681409493088722, "rewards/rule_reward": 0.025390625, "step": 353, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21461048802667476, "grad_norm": 2.669954538345337, "kl": 21.78125, "learning_rate": 9.845444087241401e-06, "loss": 0.0218, "reward": 0.003506717039272189, "reward_std": 0.1161784827709198, "rewards/ndcg_rule_reward": -0.023837032727897167, "rewards/rule_reward": 0.02734375, "step": 354, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2152167323431343, "grad_norm": 5.11698579788208, "kl": 58.25, "learning_rate": 9.84423108691457e-06, "loss": 0.0584, "reward": 0.003578267991542816, "reward_std": 0.10772949457168579, "rewards/ndcg_rule_reward": -0.021812357008457184, "rewards/rule_reward": 0.025390625, "step": 355, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21582297665959382, "grad_norm": 1.576030969619751, "kl": 33.0625, "learning_rate": 9.843013420496628e-06, "loss": 0.0331, "reward": 0.003438027575612068, "reward_std": 0.1078110784292221, "rewards/ndcg_rule_reward": -0.021952597424387932, "rewards/rule_reward": 0.025390625, "step": 356, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.21642922097605335, "grad_norm": 1.8236455917358398, "kl": 23.75, "learning_rate": 9.841791089160463e-06, "loss": 0.0237, "reward": 0.003968513919971883, "reward_std": 0.1412183791399002, "rewards/ndcg_rule_reward": -0.029234610497951508, "rewards/rule_reward": 0.033203125, "step": 357, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21703546529251289, "grad_norm": 1.4939981698989868, "kl": 13.21875, "learning_rate": 9.840564094083461e-06, "loss": 0.0132, "reward": 0.0025590050499886274, "reward_std": 0.0998392328619957, "rewards/ndcg_rule_reward": -0.020878495648503304, "rewards/rule_reward": 0.0234375, "step": 358, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21764170960897242, "grad_norm": 1.1027085781097412, "kl": 29.5625, "learning_rate": 9.839332436447492e-06, "loss": 0.0296, "reward": 0.0030884892912581563, "reward_std": 0.10797382518649101, "rewards/ndcg_rule_reward": -0.02230213675647974, "rewards/rule_reward": 0.025390625, "step": 359, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21824795392543195, "grad_norm": 1.449570894241333, "kl": 13.375, "learning_rate": 9.838096117438922e-06, "loss": 0.0134, "reward": 0.0030482455622404814, "reward_std": 0.09958647936582565, "rewards/ndcg_rule_reward": -0.02038925513625145, "rewards/rule_reward": 0.0234375, "step": 360, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.21885419824189148, "grad_norm": 1.6289328336715698, "kl": 21.375, "learning_rate": 9.836855138248604e-06, "loss": 0.0214, "reward": 0.0033434585202485323, "reward_std": 0.12465458735823631, "rewards/ndcg_rule_reward": -0.02595341671258211, "rewards/rule_reward": 0.029296875, "step": 361, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.219460442558351, "grad_norm": 1.4999892711639404, "kl": 29.875, "learning_rate": 9.835609500071886e-06, "loss": 0.0299, "reward": 0.004763326374813914, "reward_std": 0.16607007384300232, "rewards/ndcg_rule_reward": -0.03429917432367802, "rewards/rule_reward": 0.0390625, "step": 362, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22006668687481054, "grad_norm": 1.8812617063522339, "kl": 41.9375, "learning_rate": 9.834359204108597e-06, "loss": 0.0419, "reward": 0.00467999407555908, "reward_std": 0.12404874712228775, "rewards/ndcg_rule_reward": -0.02461688034236431, "rewards/rule_reward": 0.029296875, "step": 363, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.22067293119127007, "grad_norm": 1.3194831609725952, "kl": 35.0625, "learning_rate": 9.833104251563058e-06, "loss": 0.0351, "reward": 0.0033535786205902696, "reward_std": 0.10783475637435913, "rewards/ndcg_rule_reward": -0.022037046030163765, "rewards/rule_reward": 0.025390625, "step": 364, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2212791755077296, "grad_norm": 1.6463432312011719, "kl": 29.25, "learning_rate": 9.831844643644067e-06, "loss": 0.0293, "reward": 0.0027661373023875058, "reward_std": 0.09971830621361732, "rewards/ndcg_rule_reward": -0.020671362057328224, "rewards/rule_reward": 0.0234375, "step": 365, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22188541982418916, "grad_norm": 2.2088708877563477, "kl": 43.0, "learning_rate": 9.830580381564916e-06, "loss": 0.0429, "reward": 0.002370418980717659, "reward_std": 0.08310208842158318, "rewards/ndcg_rule_reward": -0.01716083101928234, "rewards/rule_reward": 0.01953125, "step": 366, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2224916641406487, "grad_norm": 1.5753427743911743, "kl": 24.75, "learning_rate": 9.829311466543373e-06, "loss": 0.0248, "reward": 0.002542946022003889, "reward_std": 0.09981907904148102, "rewards/ndcg_rule_reward": -0.020894554443657398, "rewards/rule_reward": 0.0234375, "step": 367, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22309790845710822, "grad_norm": 2.4422216415405273, "kl": 68.75, "learning_rate": 9.828037899801693e-06, "loss": 0.0687, "reward": 0.004424390732310712, "reward_std": 0.13259224221110344, "rewards/ndcg_rule_reward": -0.02682560868561268, "rewards/rule_reward": 0.03125, "step": 368, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22370415277356775, "grad_norm": 1.513738751411438, "kl": 34.625, "learning_rate": 9.826759682566606e-06, "loss": 0.0346, "reward": 0.0020921012619510293, "reward_std": 0.06633023172616959, "rewards/ndcg_rule_reward": -0.013532898388803005, "rewards/rule_reward": 0.015625, "step": 369, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22431039709002729, "grad_norm": 2.8667256832122803, "kl": 31.4375, "learning_rate": 9.825476816069326e-06, "loss": 0.0315, "reward": 0.002781714778393507, "reward_std": 0.09127579629421234, "rewards/ndcg_rule_reward": -0.01870266068726778, "rewards/rule_reward": 0.021484375, "step": 370, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22491664140648682, "grad_norm": 2.9913315773010254, "kl": 43.25, "learning_rate": 9.824189301545545e-06, "loss": 0.0432, "reward": 0.003169870004057884, "reward_std": 0.0910898894071579, "rewards/ndcg_rule_reward": -0.01831450592726469, "rewards/rule_reward": 0.021484375, "step": 371, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22552288572294635, "grad_norm": 1.4995274543762207, "kl": 25.6875, "learning_rate": 9.82289714023543e-06, "loss": 0.0256, "reward": 0.00276505621150136, "reward_std": 0.10811063274741173, "rewards/ndcg_rule_reward": -0.022625569254159927, "rewards/rule_reward": 0.025390625, "step": 372, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22612913003940588, "grad_norm": 1.2580678462982178, "kl": 36.625, "learning_rate": 9.821600333383626e-06, "loss": 0.0366, "reward": 0.0036563293542712927, "reward_std": 0.0992760919034481, "rewards/ndcg_rule_reward": -0.01978117087855935, "rewards/rule_reward": 0.0234375, "step": 373, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2267353743558654, "grad_norm": 1.276029109954834, "kl": 31.125, "learning_rate": 9.820298882239248e-06, "loss": 0.0311, "reward": 0.0030268566915765405, "reward_std": 0.09959779679775238, "rewards/ndcg_rule_reward": -0.02041064389050007, "rewards/rule_reward": 0.0234375, "step": 374, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22734161867232494, "grad_norm": 1.4454584121704102, "kl": 37.75, "learning_rate": 9.81899278805589e-06, "loss": 0.0377, "reward": 0.003022562013939023, "reward_std": 0.10799940675497055, "rewards/ndcg_rule_reward": -0.022368064150214195, "rewards/rule_reward": 0.025390625, "step": 375, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22794786298878447, "grad_norm": 2.425662040710449, "kl": 54.625, "learning_rate": 9.817682052091618e-06, "loss": 0.0546, "reward": 0.00297637062612921, "reward_std": 0.108038779348135, "rewards/ndcg_rule_reward": -0.022414255887269974, "rewards/rule_reward": 0.025390625, "step": 376, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.228554107305244, "grad_norm": 2.818368434906006, "kl": 49.375, "learning_rate": 9.816366675608966e-06, "loss": 0.0495, "reward": 0.003649675054475665, "reward_std": 0.10770449787378311, "rewards/ndcg_rule_reward": -0.021740950644016266, "rewards/rule_reward": 0.025390625, "step": 377, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.22916035162170353, "grad_norm": 7.490729808807373, "kl": 65.1875, "learning_rate": 9.81504665987494e-06, "loss": 0.0654, "reward": 0.0038580797845497727, "reward_std": 0.1412612721323967, "rewards/ndcg_rule_reward": -0.029345044866204262, "rewards/rule_reward": 0.033203125, "step": 378, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2297665959381631, "grad_norm": 2.0185904502868652, "kl": 39.125, "learning_rate": 9.813722006161013e-06, "loss": 0.0391, "reward": 0.0030854003271088004, "reward_std": 0.11637590825557709, "rewards/ndcg_rule_reward": -0.024258350022137165, "rewards/rule_reward": 0.02734375, "step": 379, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.310546875, "epoch": 0.23037284025462262, "grad_norm": 1.6170560121536255, "kl": 33.5, "learning_rate": 9.812392715743126e-06, "loss": 0.0335, "reward": 0.004072598181664944, "reward_std": 0.14116289466619492, "rewards/ndcg_rule_reward": -0.029130526818335056, "rewards/rule_reward": 0.033203125, "step": 380, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23097908457108215, "grad_norm": 2.8323299884796143, "kl": 42.0, "learning_rate": 9.811058789901688e-06, "loss": 0.0419, "reward": 0.0031765245366841555, "reward_std": 0.09109440818428993, "rewards/ndcg_rule_reward": -0.0183078502304852, "rewards/rule_reward": 0.021484375, "step": 381, "token_diversity": 0.53515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23158532888754169, "grad_norm": 2.7883148193359375, "kl": 39.875, "learning_rate": 9.809720229921573e-06, "loss": 0.0399, "reward": 0.004075997741892934, "reward_std": 0.1327257975935936, "rewards/ndcg_rule_reward": -0.02717400249093771, "rewards/rule_reward": 0.03125, "step": 382, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.005859375, "epoch": 0.23219157320400122, "grad_norm": 1.4588723182678223, "kl": 18.6875, "learning_rate": 9.808377037092111e-06, "loss": 0.0187, "reward": 0.0027834156062453985, "reward_std": 0.11656175926327705, "rewards/ndcg_rule_reward": -0.024560336023569107, "rewards/rule_reward": 0.02734375, "step": 383, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23279781752046075, "grad_norm": 2.3274898529052734, "kl": 26.40625, "learning_rate": 9.807029212707109e-06, "loss": 0.0264, "reward": 0.003436178551055491, "reward_std": 0.1414174735546112, "rewards/ndcg_rule_reward": -0.029766947962343693, "rewards/rule_reward": 0.033203125, "step": 384, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23340406183692028, "grad_norm": 2.699503183364868, "kl": 35.5, "learning_rate": 9.805676758064822e-06, "loss": 0.0355, "reward": 0.003984399605542421, "reward_std": 0.13278736546635628, "rewards/ndcg_rule_reward": -0.02726559992879629, "rewards/rule_reward": 0.03125, "step": 385, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2340103061533798, "grad_norm": 2.1117236614227295, "kl": 33.9375, "learning_rate": 9.804319674467969e-06, "loss": 0.034, "reward": 0.0033067117910832167, "reward_std": 0.13312887027859688, "rewards/ndcg_rule_reward": -0.02794328797608614, "rewards/rule_reward": 0.03125, "step": 386, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23461655046983934, "grad_norm": 7.250090599060059, "kl": 60.75, "learning_rate": 9.802957963223731e-06, "loss": 0.0606, "reward": 0.0029215292306616902, "reward_std": 0.10803189128637314, "rewards/ndcg_rule_reward": -0.02246909588575363, "rewards/rule_reward": 0.025390625, "step": 387, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23522279478629887, "grad_norm": 1.439679741859436, "kl": 25.3125, "learning_rate": 9.801591625643744e-06, "loss": 0.0254, "reward": 0.002852912526577711, "reward_std": 0.09965835511684418, "rewards/ndcg_rule_reward": -0.020584587007761, "rewards/rule_reward": 0.0234375, "step": 388, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2358290391027584, "grad_norm": 3.411794900894165, "kl": 31.75, "learning_rate": 9.800220663044102e-06, "loss": 0.0318, "reward": 0.0030406697187572718, "reward_std": 0.09112852811813354, "rewards/ndcg_rule_reward": -0.018443705514073372, "rewards/rule_reward": 0.021484375, "step": 389, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23643528341921793, "grad_norm": 1.789502501487732, "kl": 12.5625, "learning_rate": 9.798845076745349e-06, "loss": 0.0125, "reward": 0.0032285742927342653, "reward_std": 0.1163310557603836, "rewards/ndcg_rule_reward": -0.024115175008773804, "rewards/rule_reward": 0.02734375, "step": 390, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23704152773567747, "grad_norm": 1.6939643621444702, "kl": 18.328125, "learning_rate": 9.797464868072489e-06, "loss": 0.0183, "reward": 0.0019077882752753794, "reward_std": 0.05802446976304054, "rewards/ndcg_rule_reward": -0.011764087248593569, "rewards/rule_reward": 0.013671875, "step": 391, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23764777205213702, "grad_norm": 1.4651020765304565, "kl": 19.890625, "learning_rate": 9.796080038354971e-06, "loss": 0.0198, "reward": 0.0028335824608802795, "reward_std": 0.09965549036860466, "rewards/ndcg_rule_reward": -0.02060391753911972, "rewards/rule_reward": 0.0234375, "step": 392, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23825401636859656, "grad_norm": 1.8330814838409424, "kl": 35.375, "learning_rate": 9.794690588926705e-06, "loss": 0.0353, "reward": 0.0031968937255442142, "reward_std": 0.09948137030005455, "rewards/ndcg_rule_reward": -0.0202406058087945, "rewards/rule_reward": 0.0234375, "step": 393, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23886026068505609, "grad_norm": 1.3173710107803345, "kl": 27.9375, "learning_rate": 9.79329652112604e-06, "loss": 0.028, "reward": 0.0025550280697643757, "reward_std": 0.08295147866010666, "rewards/ndcg_rule_reward": -0.016976221930235624, "rewards/rule_reward": 0.01953125, "step": 394, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.23946650500151562, "grad_norm": 1.7802239656448364, "kl": 20.171875, "learning_rate": 9.791897836295783e-06, "loss": 0.0202, "reward": 0.0022508191177621484, "reward_std": 0.08311119303107262, "rewards/ndcg_rule_reward": -0.01728043146431446, "rewards/rule_reward": 0.01953125, "step": 395, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24007274931797515, "grad_norm": 1.097015142440796, "kl": 27.234375, "learning_rate": 9.790494535783183e-06, "loss": 0.0273, "reward": 0.0023058372898958623, "reward_std": 0.09148667380213737, "rewards/ndcg_rule_reward": -0.019178538117557764, "rewards/rule_reward": 0.021484375, "step": 396, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24067899363443468, "grad_norm": 1.1104118824005127, "kl": 27.34375, "learning_rate": 9.789086620939936e-06, "loss": 0.0274, "reward": 0.0025345648755319417, "reward_std": 0.09139128774404526, "rewards/ndcg_rule_reward": -0.018949810415506363, "rewards/rule_reward": 0.021484375, "step": 397, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2412852379508942, "grad_norm": 13.80337905883789, "kl": 30.3125, "learning_rate": 9.787674093122182e-06, "loss": 0.0303, "reward": 0.0034163586096838117, "reward_std": 0.10781006887555122, "rewards/ndcg_rule_reward": -0.021974267438054085, "rewards/rule_reward": 0.025390625, "step": 398, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24189148226735374, "grad_norm": 2.0321264266967773, "kl": 34.1875, "learning_rate": 9.786256953690505e-06, "loss": 0.0342, "reward": 0.003526408690959215, "reward_std": 0.11617181822657585, "rewards/ndcg_rule_reward": -0.023817340843379498, "rewards/rule_reward": 0.02734375, "step": 399, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24249772658381327, "grad_norm": 1.6328909397125244, "kl": 42.875, "learning_rate": 9.784835204009932e-06, "loss": 0.0429, "reward": 0.0044435710879042745, "reward_std": 0.14101071655750275, "rewards/ndcg_rule_reward": -0.028759554028511047, "rewards/rule_reward": 0.033203125, "step": 400, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2431039709002728, "grad_norm": 2.2107417583465576, "kl": 55.25, "learning_rate": 9.78340884544993e-06, "loss": 0.0553, "reward": 0.004453946137800813, "reward_std": 0.14101392775774002, "rewards/ndcg_rule_reward": -0.028749180026352406, "rewards/rule_reward": 0.033203125, "step": 401, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24371021521673233, "grad_norm": 1.8192814588546753, "kl": 24.8125, "learning_rate": 9.781977879384403e-06, "loss": 0.0248, "reward": 0.0019835250568576157, "reward_std": 0.08322227746248245, "rewards/ndcg_rule_reward": -0.017547725699841976, "rewards/rule_reward": 0.01953125, "step": 402, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24431645953319187, "grad_norm": 1.5896401405334473, "kl": 28.875, "learning_rate": 9.780542307191698e-06, "loss": 0.0289, "reward": 0.002393391914665699, "reward_std": 0.08303504064679146, "rewards/ndcg_rule_reward": -0.0171378580853343, "rewards/rule_reward": 0.01953125, "step": 403, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2449227038496514, "grad_norm": 1.9869706630706787, "kl": 53.625, "learning_rate": 9.779102130254594e-06, "loss": 0.0538, "reward": 0.0031755580566823483, "reward_std": 0.09951749444007874, "rewards/ndcg_rule_reward": -0.02026194240897894, "rewards/rule_reward": 0.0234375, "step": 404, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24552894816611096, "grad_norm": 1.7612528800964355, "kl": 42.125, "learning_rate": 9.777657349960307e-06, "loss": 0.0422, "reward": 0.0044645387679338455, "reward_std": 0.14940145611763, "rewards/ndcg_rule_reward": -0.030691713094711304, "rewards/rule_reward": 0.03515625, "step": 405, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2461351924825705, "grad_norm": 1.4200701713562012, "kl": 29.5, "learning_rate": 9.77620796770049e-06, "loss": 0.0295, "reward": 0.0020352157298475504, "reward_std": 0.06638309359550476, "rewards/ndcg_rule_reward": -0.013589784502983093, "rewards/rule_reward": 0.015625, "step": 406, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24674143679903002, "grad_norm": 2.3813061714172363, "kl": 29.4375, "learning_rate": 9.774753984871227e-06, "loss": 0.0294, "reward": 0.0031578781781718135, "reward_std": 0.11631760001182556, "rewards/ndcg_rule_reward": -0.024185871705412865, "rewards/rule_reward": 0.02734375, "step": 407, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24734768111548955, "grad_norm": 1.49534010887146, "kl": 29.125, "learning_rate": 9.773295402873027e-06, "loss": 0.0291, "reward": 0.0029649913776665926, "reward_std": 0.09958548843860626, "rewards/ndcg_rule_reward": -0.02047250885516405, "rewards/rule_reward": 0.0234375, "step": 408, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24795392543194908, "grad_norm": 1.4694942235946655, "kl": 14.078125, "learning_rate": 9.77183222311084e-06, "loss": 0.014, "reward": 0.0024754127953201532, "reward_std": 0.0998365469276905, "rewards/ndcg_rule_reward": -0.02096208743751049, "rewards/rule_reward": 0.0234375, "step": 409, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2485601697484086, "grad_norm": 2.439582109451294, "kl": 36.0, "learning_rate": 9.770364446994035e-06, "loss": 0.0359, "reward": 0.004124098224565387, "reward_std": 0.14114350825548172, "rewards/ndcg_rule_reward": -0.029079025611281395, "rewards/rule_reward": 0.033203125, "step": 410, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24916641406486814, "grad_norm": 1.5106351375579834, "kl": 20.8125, "learning_rate": 9.768892075936418e-06, "loss": 0.0208, "reward": 0.003599384566769004, "reward_std": 0.11614682525396347, "rewards/ndcg_rule_reward": -0.023744365200400352, "rewards/rule_reward": 0.02734375, "step": 411, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.24977265838132767, "grad_norm": 3.0149104595184326, "kl": 48.625, "learning_rate": 9.76741511135621e-06, "loss": 0.0486, "reward": 0.003776647034101188, "reward_std": 0.12446337193250656, "rewards/ndcg_rule_reward": -0.02552022784948349, "rewards/rule_reward": 0.029296875, "step": 412, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.25037890269778723, "grad_norm": 11.116964340209961, "kl": 27.6875, "learning_rate": 9.765933554676066e-06, "loss": 0.0277, "reward": 0.0030119982548058033, "reward_std": 0.09957165271043777, "rewards/ndcg_rule_reward": -0.020425502210855484, "rewards/rule_reward": 0.0234375, "step": 413, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.25098514701424673, "grad_norm": 1.499633550643921, "kl": 35.9375, "learning_rate": 9.764447407323058e-06, "loss": 0.036, "reward": 0.003625058801844716, "reward_std": 0.12453844025731087, "rewards/ndcg_rule_reward": -0.025671816430985928, "rewards/rule_reward": 0.029296875, "step": 414, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2515913913307063, "grad_norm": 3.29518985748291, "kl": 32.25, "learning_rate": 9.762956670728685e-06, "loss": 0.0322, "reward": 0.003601418691687286, "reward_std": 0.124573465436697, "rewards/ndcg_rule_reward": -0.025695456191897392, "rewards/rule_reward": 0.029296875, "step": 415, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2521976356471658, "grad_norm": 1.8416502475738525, "kl": 28.625, "learning_rate": 9.761461346328863e-06, "loss": 0.0286, "reward": 0.003416358958929777, "reward_std": 0.11624299362301826, "rewards/ndcg_rule_reward": -0.02392739150673151, "rewards/rule_reward": 0.02734375, "step": 416, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.25280387996362536, "grad_norm": 1.5168263912200928, "kl": 25.375, "learning_rate": 9.759961435563928e-06, "loss": 0.0254, "reward": 0.0029998377431184053, "reward_std": 0.11642570793628693, "rewards/ndcg_rule_reward": -0.02434391248971224, "rewards/rule_reward": 0.02734375, "step": 417, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.25341012428008486, "grad_norm": 2.472905397415161, "kl": 25.75, "learning_rate": 9.75845693987863e-06, "loss": 0.0259, "reward": 0.003347428049892187, "reward_std": 0.10783354565501213, "rewards/ndcg_rule_reward": -0.022043196484446526, "rewards/rule_reward": 0.025390625, "step": 418, "token_diversity": 0.36328125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2540163685965444, "grad_norm": 1.1222459077835083, "kl": 18.5, "learning_rate": 9.756947860722143e-06, "loss": 0.0185, "reward": 0.003061440249439329, "reward_std": 0.10798170790076256, "rewards/ndcg_rule_reward": -0.02232918469235301, "rewards/rule_reward": 0.025390625, "step": 419, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2546226129130039, "grad_norm": 1.5183781385421753, "kl": 22.3125, "learning_rate": 9.755434199548051e-06, "loss": 0.0224, "reward": 0.001673659891821444, "reward_std": 0.06652488559484482, "rewards/ndcg_rule_reward": -0.01395133975893259, "rewards/rule_reward": 0.015625, "step": 420, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2552288572294635, "grad_norm": 1.5149637460708618, "kl": 27.5, "learning_rate": 9.753915957814353e-06, "loss": 0.0275, "reward": 0.0027386004221625626, "reward_std": 0.0913291946053505, "rewards/ndcg_rule_reward": -0.01874577533453703, "rewards/rule_reward": 0.021484375, "step": 421, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.255835101545923, "grad_norm": 1.4570001363754272, "kl": 21.0625, "learning_rate": 9.752393136983457e-06, "loss": 0.021, "reward": 0.003240818274207413, "reward_std": 0.1247241161763668, "rewards/ndcg_rule_reward": -0.026056057773530483, "rewards/rule_reward": 0.029296875, "step": 422, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.25644134586238254, "grad_norm": 1.3217922449111938, "kl": 18.875, "learning_rate": 9.750865738522187e-06, "loss": 0.0188, "reward": 0.002448564860969782, "reward_std": 0.09984337911009789, "rewards/ndcg_rule_reward": -0.020988935604691505, "rewards/rule_reward": 0.0234375, "step": 423, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2570475901788421, "grad_norm": 1.5569100379943848, "kl": 39.9375, "learning_rate": 9.74933376390177e-06, "loss": 0.04, "reward": 0.0034938190365210176, "reward_std": 0.11618609726428986, "rewards/ndcg_rule_reward": -0.02384993154555559, "rewards/rule_reward": 0.02734375, "step": 424, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.2576538344953016, "grad_norm": 3.2962303161621094, "kl": 47.5, "learning_rate": 9.747797214597849e-06, "loss": 0.0475, "reward": 0.0038514903280884027, "reward_std": 0.11603721231222153, "rewards/ndcg_rule_reward": -0.02349225990474224, "rewards/rule_reward": 0.02734375, "step": 425, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.25826007881176116, "grad_norm": 1.8633646965026855, "kl": 36.5, "learning_rate": 9.746256092090465e-06, "loss": 0.0365, "reward": 0.003016589500475675, "reward_std": 0.09956896118819714, "rewards/ndcg_rule_reward": -0.02042091079056263, "rewards/rule_reward": 0.0234375, "step": 426, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.25886632312822067, "grad_norm": 2.1346471309661865, "kl": 35.1875, "learning_rate": 9.744710397864068e-06, "loss": 0.0352, "reward": 0.00383273686747998, "reward_std": 0.13284069299697876, "rewards/ndcg_rule_reward": -0.027417263016104698, "rewards/rule_reward": 0.03125, "step": 427, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2594725674446802, "grad_norm": 1.7870104312896729, "kl": 30.3125, "learning_rate": 9.743160133407514e-06, "loss": 0.0303, "reward": 0.0022096362663432956, "reward_std": 0.07469236105680466, "rewards/ndcg_rule_reward": -0.015368489548563957, "rewards/rule_reward": 0.017578125, "step": 428, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2600788117611397, "grad_norm": 2.6828012466430664, "kl": 34.375, "learning_rate": 9.741605300214057e-06, "loss": 0.0344, "reward": 0.004918053047731519, "reward_std": 0.18281487375497818, "rewards/ndcg_rule_reward": -0.0380506981164217, "rewards/rule_reward": 0.04296875, "step": 429, "token_diversity": 0.37109375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2606850560775993, "grad_norm": 1.207011342048645, "kl": 45.75, "learning_rate": 9.740045899781353e-06, "loss": 0.0458, "reward": 0.0038468181155622005, "reward_std": 0.11600739881396294, "rewards/ndcg_rule_reward": -0.023496931418776512, "rewards/rule_reward": 0.02734375, "step": 430, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2612913003940588, "grad_norm": 1.200925350189209, "kl": 34.3125, "learning_rate": 9.738481933611459e-06, "loss": 0.0343, "reward": 0.003140753600746393, "reward_std": 0.09952540695667267, "rewards/ndcg_rule_reward": -0.020296746864914894, "rewards/rule_reward": 0.0234375, "step": 431, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.26189754471051835, "grad_norm": 1.1762505769729614, "kl": 27.984375, "learning_rate": 9.736913403210826e-06, "loss": 0.028, "reward": 0.003593187779188156, "reward_std": 0.11615586280822754, "rewards/ndcg_rule_reward": -0.023750562220811844, "rewards/rule_reward": 0.02734375, "step": 432, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.26250378902697785, "grad_norm": 1.9187965393066406, "kl": 28.5625, "learning_rate": 9.735340310090306e-06, "loss": 0.0285, "reward": 0.0029625900206156075, "reward_std": 0.0827888771891594, "rewards/ndcg_rule_reward": -0.01656865980476141, "rewards/rule_reward": 0.01953125, "step": 433, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2631100333434374, "grad_norm": 1.4762510061264038, "kl": 28.53125, "learning_rate": 9.733762655765145e-06, "loss": 0.0285, "reward": 0.002884030109271407, "reward_std": 0.11643096059560776, "rewards/ndcg_rule_reward": -0.02445971965789795, "rewards/rule_reward": 0.02734375, "step": 434, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.36328125, "epoch": 0.2637162776598969, "grad_norm": 3.0625438690185547, "kl": 34.25, "learning_rate": 9.732180441754977e-06, "loss": 0.0343, "reward": 0.0034891741815954447, "reward_std": 0.12461597844958305, "rewards/ndcg_rule_reward": -0.025807702913880348, "rewards/rule_reward": 0.029296875, "step": 435, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2643225219763565, "grad_norm": 2.0832607746124268, "kl": 39.25, "learning_rate": 9.730593669583836e-06, "loss": 0.0393, "reward": 0.00362060172483325, "reward_std": 0.12457526475191116, "rewards/ndcg_rule_reward": -0.025676274672150612, "rewards/rule_reward": 0.029296875, "step": 436, "token_diversity": 0.53125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.26492876629281603, "grad_norm": 5.2092366218566895, "kl": 46.375, "learning_rate": 9.72900234078014e-06, "loss": 0.0464, "reward": 0.0037179160863161087, "reward_std": 0.12454912438988686, "rewards/ndcg_rule_reward": -0.02557895891368389, "rewards/rule_reward": 0.029296875, "step": 437, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.26553501060927553, "grad_norm": 1.1659578084945679, "kl": 29.8125, "learning_rate": 9.727406456876703e-06, "loss": 0.0298, "reward": 0.003251983202062547, "reward_std": 0.10789615288376808, "rewards/ndcg_rule_reward": -0.02213864214718342, "rewards/rule_reward": 0.025390625, "step": 438, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2661412549257351, "grad_norm": 2.5614519119262695, "kl": 38.0, "learning_rate": 9.725806019410718e-06, "loss": 0.038, "reward": 0.0032245147740468383, "reward_std": 0.1079254075884819, "rewards/ndcg_rule_reward": -0.022166110575199127, "rewards/rule_reward": 0.025390625, "step": 439, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2667474992421946, "grad_norm": 1.723532795906067, "kl": 28.3125, "learning_rate": 9.72420102992377e-06, "loss": 0.0284, "reward": 0.0032136451336555183, "reward_std": 0.1163368821144104, "rewards/ndcg_rule_reward": -0.02413010597229004, "rewards/rule_reward": 0.02734375, "step": 440, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.26735374355865416, "grad_norm": 2.2921111583709717, "kl": 40.25, "learning_rate": 9.722591489961829e-06, "loss": 0.0402, "reward": 0.003667256096377969, "reward_std": 0.11609869450330734, "rewards/ndcg_rule_reward": -0.023676494136452675, "rewards/rule_reward": 0.02734375, "step": 441, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.015625, "epoch": 0.26795998787511366, "grad_norm": 1.2866371870040894, "kl": 20.8125, "learning_rate": 9.720977401075243e-06, "loss": 0.0208, "reward": 0.0024907244369387627, "reward_std": 0.10822967812418938, "rewards/ndcg_rule_reward": -0.022899900563061237, "rewards/rule_reward": 0.025390625, "step": 442, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 5.115234375, "epoch": 0.2685662321915732, "grad_norm": 2.0503854751586914, "kl": 58.875, "learning_rate": 9.719358764818748e-06, "loss": 0.059, "reward": 0.00436501600779593, "reward_std": 0.14103581011295319, "rewards/ndcg_rule_reward": -0.02883811015635729, "rewards/rule_reward": 0.033203125, "step": 443, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2691724765080327, "grad_norm": 1.6162832975387573, "kl": 55.375, "learning_rate": 9.717735582751454e-06, "loss": 0.0554, "reward": 0.004558513173833489, "reward_std": 0.14095087349414825, "rewards/ndcg_rule_reward": -0.02864461112767458, "rewards/rule_reward": 0.033203125, "step": 444, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.1171875, "epoch": 0.2697787208244923, "grad_norm": 6.628988742828369, "kl": 30.1875, "learning_rate": 9.716107856436855e-06, "loss": 0.0302, "reward": 0.003772270865738392, "reward_std": 0.15810485929250717, "rewards/ndcg_rule_reward": -0.03333710506558418, "rewards/rule_reward": 0.037109375, "step": 445, "token_diversity": 0.3075132978723404 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.2703849651409518, "grad_norm": 1.9556264877319336, "kl": 49.875, "learning_rate": 9.714475587442822e-06, "loss": 0.0498, "reward": 0.003798395744524896, "reward_std": 0.11602151766419411, "rewards/ndcg_rule_reward": -0.023545355536043644, "rewards/rule_reward": 0.02734375, "step": 446, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.326171875, "epoch": 0.27099120945741134, "grad_norm": 1.569144368171692, "kl": 31.4375, "learning_rate": 9.712838777341596e-06, "loss": 0.0314, "reward": 0.0027895120438188314, "reward_std": 0.10810542106628418, "rewards/ndcg_rule_reward": -0.0226011136546731, "rewards/rule_reward": 0.025390625, "step": 447, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.27159745377387084, "grad_norm": 0.9129868745803833, "kl": 21.75, "learning_rate": 9.711197427709796e-06, "loss": 0.0217, "reward": 0.0019970136927440763, "reward_std": 0.06639276072382927, "rewards/ndcg_rule_reward": -0.01362798735499382, "rewards/rule_reward": 0.015625, "step": 448, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2722036980903304, "grad_norm": 3.5701956748962402, "kl": 34.25, "learning_rate": 9.709551540128415e-06, "loss": 0.0343, "reward": 0.004555937368422747, "reward_std": 0.1325158253312111, "rewards/ndcg_rule_reward": -0.026694064028561115, "rewards/rule_reward": 0.03125, "step": 449, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.27280994240678996, "grad_norm": 1.5955474376678467, "kl": 38.3125, "learning_rate": 9.707901116182814e-06, "loss": 0.0383, "reward": 0.004495913628488779, "reward_std": 0.14936305955052376, "rewards/ndcg_rule_reward": -0.030660336837172508, "rewards/rule_reward": 0.03515625, "step": 450, "token_diversity": 0.4098125 }, { "categorical_diversity": 1.0, "completion_length": 5.04296875, "epoch": 0.27341618672324947, "grad_norm": 1.0561985969543457, "kl": 19.8125, "learning_rate": 9.706246157462726e-06, "loss": 0.0198, "reward": 0.0025136357871815562, "reward_std": 0.0913965106010437, "rewards/ndcg_rule_reward": -0.018970739096403122, "rewards/rule_reward": 0.021484375, "step": 451, "token_diversity": 0.53125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.274022431039709, "grad_norm": 1.6295204162597656, "kl": 31.9375, "learning_rate": 9.70458666556225e-06, "loss": 0.0319, "reward": 0.004041828098706901, "reward_std": 0.13280856236815453, "rewards/ndcg_rule_reward": -0.027208171784877777, "rewards/rule_reward": 0.03125, "step": 452, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.02734375, "epoch": 0.27462867535616853, "grad_norm": 1.724489688873291, "kl": 17.875, "learning_rate": 9.702922642079851e-06, "loss": 0.0178, "reward": 0.0030131820822134614, "reward_std": 0.10800221562385559, "rewards/ndcg_rule_reward": -0.022377443499863148, "rewards/rule_reward": 0.025390625, "step": 453, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2752349196726281, "grad_norm": 1.7413750886917114, "kl": 17.0, "learning_rate": 9.701254088618363e-06, "loss": 0.017, "reward": 0.0034814837854355574, "reward_std": 0.13301077485084534, "rewards/ndcg_rule_reward": -0.027768516913056374, "rewards/rule_reward": 0.03125, "step": 454, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2758411639890876, "grad_norm": 1.3837764263153076, "kl": 15.40625, "learning_rate": 9.699581006784979e-06, "loss": 0.0154, "reward": 0.00276928604580462, "reward_std": 0.1081356443464756, "rewards/ndcg_rule_reward": -0.0226213401183486, "rewards/rule_reward": 0.025390625, "step": 455, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.27644740830554715, "grad_norm": 0.9876418113708496, "kl": 28.625, "learning_rate": 9.697903398191255e-06, "loss": 0.0286, "reward": 0.0024834731593728065, "reward_std": 0.07457897067070007, "rewards/ndcg_rule_reward": -0.015094651840627193, "rewards/rule_reward": 0.017578125, "step": 456, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.27705365262200665, "grad_norm": 2.522966146469116, "kl": 44.171875, "learning_rate": 9.69622126445311e-06, "loss": 0.0441, "reward": 0.002633075346238911, "reward_std": 0.08293041959404945, "rewards/ndcg_rule_reward": -0.016898173838853836, "rewards/rule_reward": 0.01953125, "step": 457, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2776598969384662, "grad_norm": 2.668727397918701, "kl": 40.125, "learning_rate": 9.694534607190818e-06, "loss": 0.0401, "reward": 0.0027607748052105308, "reward_std": 0.08288909494876862, "rewards/ndcg_rule_reward": -0.016770475544035435, "rewards/rule_reward": 0.01953125, "step": 458, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2782661412549257, "grad_norm": 1.6671639680862427, "kl": 34.0625, "learning_rate": 9.692843428029014e-06, "loss": 0.0341, "reward": 0.0040731162298470736, "reward_std": 0.14116358757019043, "rewards/ndcg_rule_reward": -0.029130009934306145, "rewards/rule_reward": 0.033203125, "step": 459, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.958984375, "epoch": 0.2788723855713853, "grad_norm": 1.6412358283996582, "kl": 26.875, "learning_rate": 9.691147728596682e-06, "loss": 0.0269, "reward": 0.004768656566739082, "reward_std": 0.1576712429523468, "rewards/ndcg_rule_reward": -0.03234071936458349, "rewards/rule_reward": 0.037109375, "step": 460, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.2794786298878448, "grad_norm": 1.4499480724334717, "kl": 32.25, "learning_rate": 9.689447510527167e-06, "loss": 0.0322, "reward": 0.0031780331628397107, "reward_std": 0.09948722273111343, "rewards/ndcg_rule_reward": -0.020259467884898186, "rewards/rule_reward": 0.0234375, "step": 461, "token_diversity": 0.375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.28008487420430433, "grad_norm": 1.6356736421585083, "kl": 34.3125, "learning_rate": 9.687742775458164e-06, "loss": 0.0344, "reward": 0.003570115310139954, "reward_std": 0.12458331510424614, "rewards/ndcg_rule_reward": -0.025726759806275368, "rewards/rule_reward": 0.029296875, "step": 462, "token_diversity": 0.3828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2806911185207639, "grad_norm": 5.090761661529541, "kl": 49.6875, "learning_rate": 9.68603352503172e-06, "loss": 0.0496, "reward": 0.002637196332216263, "reward_std": 0.10817737132310867, "rewards/ndcg_rule_reward": -0.022753428667783737, "rewards/rule_reward": 0.025390625, "step": 463, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2812973628372234, "grad_norm": 1.8160477876663208, "kl": 66.625, "learning_rate": 9.684319760894227e-06, "loss": 0.0665, "reward": 0.0038961635436862707, "reward_std": 0.12442467361688614, "rewards/ndcg_rule_reward": -0.02540071215480566, "rewards/rule_reward": 0.029296875, "step": 464, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.28190360715368296, "grad_norm": 2.520477771759033, "kl": 71.5, "learning_rate": 9.682601484696429e-06, "loss": 0.0716, "reward": 0.0037514374125748873, "reward_std": 0.13289567828178406, "rewards/ndcg_rule_reward": -0.027498561888933182, "rewards/rule_reward": 0.03125, "step": 465, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.28250985147014246, "grad_norm": 1.682029128074646, "kl": 38.125, "learning_rate": 9.680878698093417e-06, "loss": 0.0382, "reward": 0.002770880935713649, "reward_std": 0.10813529044389725, "rewards/ndcg_rule_reward": -0.022619743831455708, "rewards/rule_reward": 0.025390625, "step": 466, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.283116095786602, "grad_norm": 1.9650782346725464, "kl": 33.375, "learning_rate": 9.679151402744622e-06, "loss": 0.0333, "reward": 0.003298679133877158, "reward_std": 0.11629663035273552, "rewards/ndcg_rule_reward": -0.024045070633292198, "rewards/rule_reward": 0.02734375, "step": 467, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2837223401030615, "grad_norm": 1.2002737522125244, "kl": 21.71875, "learning_rate": 9.67741960031382e-06, "loss": 0.0217, "reward": 0.002345685090404004, "reward_std": 0.07465151231735945, "rewards/ndcg_rule_reward": -0.015232440433464944, "rewards/rule_reward": 0.017578125, "step": 468, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2843285844195211, "grad_norm": 2.2076005935668945, "kl": 45.25, "learning_rate": 9.675683292469131e-06, "loss": 0.0453, "reward": 0.003521128324791789, "reward_std": 0.10777654871344566, "rewards/ndcg_rule_reward": -0.021869497373700142, "rewards/rule_reward": 0.025390625, "step": 469, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2849348287359806, "grad_norm": 2.30717396736145, "kl": 42.0, "learning_rate": 9.673942480883012e-06, "loss": 0.0421, "reward": 0.005068400176241994, "reward_std": 0.14911605417728424, "rewards/ndcg_rule_reward": -0.030087849125266075, "rewards/rule_reward": 0.03515625, "step": 470, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.232421875, "epoch": 0.28554107305244014, "grad_norm": 3.9967451095581055, "kl": 20.25, "learning_rate": 9.672197167232258e-06, "loss": 0.0203, "reward": 0.0034941743360832334, "reward_std": 0.11618757620453835, "rewards/ndcg_rule_reward": -0.023849576711654663, "rewards/rule_reward": 0.02734375, "step": 471, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.28614731736889965, "grad_norm": 2.2714314460754395, "kl": 12.796875, "learning_rate": 9.670447353198e-06, "loss": 0.0128, "reward": 0.002052133553661406, "reward_std": 0.09163399040699005, "rewards/ndcg_rule_reward": -0.019432242028415203, "rewards/rule_reward": 0.021484375, "step": 472, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2867535616853592, "grad_norm": 0.9551554322242737, "kl": 18.15625, "learning_rate": 9.668693040465708e-06, "loss": 0.0182, "reward": 0.0015762359835207462, "reward_std": 0.058160506188869476, "rewards/ndcg_rule_reward": -0.012095639016479254, "rewards/rule_reward": 0.013671875, "step": 473, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.2873598060018187, "grad_norm": 1.4615275859832764, "kl": 28.625, "learning_rate": 9.66693423072518e-06, "loss": 0.0286, "reward": 0.003475830890238285, "reward_std": 0.1077878549695015, "rewards/ndcg_rule_reward": -0.021914794109761715, "rewards/rule_reward": 0.025390625, "step": 474, "token_diversity": 0.3203125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.28796605031827827, "grad_norm": 2.837614059448242, "kl": 37.375, "learning_rate": 9.665170925670549e-06, "loss": 0.0373, "reward": 0.0038226695032790303, "reward_std": 0.12444432079792023, "rewards/ndcg_rule_reward": -0.02547420561313629, "rewards/rule_reward": 0.029296875, "step": 475, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2885722946347378, "grad_norm": 1.0360709428787231, "kl": 12.59375, "learning_rate": 9.663403127000274e-06, "loss": 0.0126, "reward": 0.0022603688994422555, "reward_std": 0.08313939161598682, "rewards/ndcg_rule_reward": -0.017270881682634354, "rewards/rule_reward": 0.01953125, "step": 476, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.28917853895119733, "grad_norm": 3.0895802974700928, "kl": 32.25, "learning_rate": 9.661630836417149e-06, "loss": 0.0322, "reward": 0.0026377711910754442, "reward_std": 0.09975072368979454, "rewards/ndcg_rule_reward": -0.020799729973077774, "rewards/rule_reward": 0.0234375, "step": 477, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2897847832676569, "grad_norm": 1.9102946519851685, "kl": 30.4375, "learning_rate": 9.65985405562829e-06, "loss": 0.0304, "reward": 0.003192956210114062, "reward_std": 0.11634579300880432, "rewards/ndcg_rule_reward": -0.024150792509317398, "rewards/rule_reward": 0.02734375, "step": 478, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2903910275841164, "grad_norm": 1.4725667238235474, "kl": 23.4375, "learning_rate": 9.658072786345139e-06, "loss": 0.0234, "reward": 0.0028146106051281095, "reward_std": 0.09966789931058884, "rewards/ndcg_rule_reward": -0.020622889511287212, "rewards/rule_reward": 0.0234375, "step": 479, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.29099727190057595, "grad_norm": 1.492108941078186, "kl": 32.8125, "learning_rate": 9.656287030283463e-06, "loss": 0.0328, "reward": 0.004228597739711404, "reward_std": 0.13269128277897835, "rewards/ndcg_rule_reward": -0.027021401561796665, "rewards/rule_reward": 0.03125, "step": 480, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.29160351621703545, "grad_norm": 2.2214839458465576, "kl": 27.03125, "learning_rate": 9.654496789163344e-06, "loss": 0.027, "reward": 0.004007032141089439, "reward_std": 0.1327819749712944, "rewards/ndcg_rule_reward": -0.02724296785891056, "rewards/rule_reward": 0.03125, "step": 481, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.017578125, "epoch": 0.292209760533495, "grad_norm": 1.00950026512146, "kl": 20.625, "learning_rate": 9.652702064709196e-06, "loss": 0.0206, "reward": 0.0026628620689734817, "reward_std": 0.09131424687802792, "rewards/ndcg_rule_reward": -0.01882151304744184, "rewards/rule_reward": 0.021484375, "step": 482, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2928160048499545, "grad_norm": 1.274051308631897, "kl": 32.875, "learning_rate": 9.65090285864974e-06, "loss": 0.0328, "reward": 0.0038963935803622007, "reward_std": 0.13281582295894623, "rewards/ndcg_rule_reward": -0.02735360711812973, "rewards/rule_reward": 0.03125, "step": 483, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2934222491664141, "grad_norm": 1.2755836248397827, "kl": 37.0, "learning_rate": 9.649099172718022e-06, "loss": 0.037, "reward": 0.0034437449648976326, "reward_std": 0.10780191421508789, "rewards/ndcg_rule_reward": -0.021946880035102367, "rewards/rule_reward": 0.025390625, "step": 484, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2940284934828736, "grad_norm": 1.8940871953964233, "kl": 36.75, "learning_rate": 9.647291008651398e-06, "loss": 0.0368, "reward": 0.0028950367122888565, "reward_std": 0.11646206304430962, "rewards/ndcg_rule_reward": -0.024448713287711143, "rewards/rule_reward": 0.02734375, "step": 485, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.29463473779933314, "grad_norm": 1.3991305828094482, "kl": 43.25, "learning_rate": 9.64547836819154e-06, "loss": 0.0432, "reward": 0.0024021276040002704, "reward_std": 0.09985354542732239, "rewards/ndcg_rule_reward": -0.021035373210906982, "rewards/rule_reward": 0.0234375, "step": 486, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 5.4296875, "epoch": 0.29524098211579264, "grad_norm": 1.384003758430481, "kl": 28.375, "learning_rate": 9.643661253084431e-06, "loss": 0.0284, "reward": 0.003792176488786936, "reward_std": 0.12445241957902908, "rewards/ndcg_rule_reward": -0.02550469897687435, "rewards/rule_reward": 0.029296875, "step": 487, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2958472264322522, "grad_norm": 1.2990254163742065, "kl": 13.96875, "learning_rate": 9.641839665080363e-06, "loss": 0.014, "reward": 0.002588752133306116, "reward_std": 0.0997701920568943, "rewards/ndcg_rule_reward": -0.020848747808486223, "rewards/rule_reward": 0.0234375, "step": 488, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.29645347074871176, "grad_norm": 1.7128058671951294, "kl": 18.25, "learning_rate": 9.64001360593394e-06, "loss": 0.0182, "reward": 0.003561120596714318, "reward_std": 0.12455346435308456, "rewards/ndcg_rule_reward": -0.02573575358837843, "rewards/rule_reward": 0.029296875, "step": 489, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.29705971506517126, "grad_norm": 3.1516733169555664, "kl": 43.625, "learning_rate": 9.63818307740407e-06, "loss": 0.0436, "reward": 0.004197969334200025, "reward_std": 0.14109386503696442, "rewards/ndcg_rule_reward": -0.029005154967308044, "rewards/rule_reward": 0.033203125, "step": 490, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2976659593816308, "grad_norm": 1.2865861654281616, "kl": 36.375, "learning_rate": 9.636348081253964e-06, "loss": 0.0364, "reward": 0.004015492741018534, "reward_std": 0.12435845285654068, "rewards/ndcg_rule_reward": -0.025281382724642754, "rewards/rule_reward": 0.029296875, "step": 491, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2982722036980903, "grad_norm": 1.318623423576355, "kl": 22.9375, "learning_rate": 9.63450861925114e-06, "loss": 0.0229, "reward": 0.0017724017379805446, "reward_std": 0.07489141076803207, "rewards/ndcg_rule_reward": -0.015805722679942846, "rewards/rule_reward": 0.017578125, "step": 492, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2988784480145499, "grad_norm": 2.8465683460235596, "kl": 37.0, "learning_rate": 9.632664693167417e-06, "loss": 0.0371, "reward": 0.004165046149864793, "reward_std": 0.16638518869876862, "rewards/ndcg_rule_reward": -0.03489745408296585, "rewards/rule_reward": 0.0390625, "step": 493, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.2994846923310094, "grad_norm": 1.2368685007095337, "kl": 27.5, "learning_rate": 9.630816304778912e-06, "loss": 0.0274, "reward": 0.0020333807915449142, "reward_std": 0.08320929855108261, "rewards/ndcg_rule_reward": -0.017497869674116373, "rewards/rule_reward": 0.01953125, "step": 494, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.30009093664746894, "grad_norm": 1.2460418939590454, "kl": 29.1875, "learning_rate": 9.628963455866043e-06, "loss": 0.0293, "reward": 0.003192704636603594, "reward_std": 0.09948338940739632, "rewards/ndcg_rule_reward": -0.020244795363396406, "rewards/rule_reward": 0.0234375, "step": 495, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.30069718096392845, "grad_norm": 2.343466281890869, "kl": 48.375, "learning_rate": 9.627106148213521e-06, "loss": 0.0484, "reward": 0.0027719272766262293, "reward_std": 0.09128499776124954, "rewards/ndcg_rule_reward": -0.018712447956204414, "rewards/rule_reward": 0.021484375, "step": 496, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.301303425280388, "grad_norm": 5.310041904449463, "kl": 56.75, "learning_rate": 9.625244383610355e-06, "loss": 0.0567, "reward": 0.002958222758024931, "reward_std": 0.0911964662373066, "rewards/ndcg_rule_reward": -0.018526152707636356, "rewards/rule_reward": 0.021484375, "step": 497, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3019096695968475, "grad_norm": 4.964474678039551, "kl": 42.125, "learning_rate": 9.623378163849847e-06, "loss": 0.0421, "reward": 0.002635199693031609, "reward_std": 0.0913209468126297, "rewards/ndcg_rule_reward": -0.018849174957722425, "rewards/rule_reward": 0.021484375, "step": 498, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.30251591391330707, "grad_norm": 1.7294124364852905, "kl": 23.97265625, "learning_rate": 9.621507490729585e-06, "loss": 0.0239, "reward": 0.0036581960157491267, "reward_std": 0.1329309493303299, "rewards/ndcg_rule_reward": -0.027591804042458534, "rewards/rule_reward": 0.03125, "step": 499, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.30312215822976657, "grad_norm": 1.5256692171096802, "kl": 31.6875, "learning_rate": 9.619632366051454e-06, "loss": 0.0316, "reward": 0.0022697914391756058, "reward_std": 0.07467563822865486, "rewards/ndcg_rule_reward": -0.015308333560824394, "rewards/rule_reward": 0.017578125, "step": 500, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.30372840254622613, "grad_norm": 1.750448226928711, "kl": 40.375, "learning_rate": 9.617752791621624e-06, "loss": 0.0403, "reward": 0.0034731528721749783, "reward_std": 0.10775771364569664, "rewards/ndcg_rule_reward": -0.02191747259348631, "rewards/rule_reward": 0.025390625, "step": 501, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3043346468626857, "grad_norm": 0.9613886475563049, "kl": 35.375, "learning_rate": 9.615868769250547e-06, "loss": 0.0353, "reward": 0.0032824600348249078, "reward_std": 0.09105430915951729, "rewards/ndcg_rule_reward": -0.0182019155472517, "rewards/rule_reward": 0.021484375, "step": 502, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.130859375, "epoch": 0.3049408911791452, "grad_norm": 9.678998947143555, "kl": 66.8125, "learning_rate": 9.613980300752968e-06, "loss": 0.067, "reward": 0.00373527267947793, "reward_std": 0.12447202950716019, "rewards/ndcg_rule_reward": -0.025561602786183357, "rewards/rule_reward": 0.029296875, "step": 503, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.30554713549560475, "grad_norm": 2.045030117034912, "kl": 36.75, "learning_rate": 9.612087387947905e-06, "loss": 0.0367, "reward": 0.002665516920387745, "reward_std": 0.09134289249777794, "rewards/ndcg_rule_reward": -0.018818858545273542, "rewards/rule_reward": 0.021484375, "step": 504, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 4.994140625, "epoch": 0.30615337981206425, "grad_norm": 2.501739978790283, "kl": 36.09375, "learning_rate": 9.610190032658664e-06, "loss": 0.0361, "reward": 0.0034469827660359442, "reward_std": 0.107801279053092, "rewards/ndcg_rule_reward": -0.021943642757833004, "rewards/rule_reward": 0.025390625, "step": 505, "token_diversity": 0.37893750000000004 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3067596241285238, "grad_norm": 1.7577725648880005, "kl": 25.9375, "learning_rate": 9.608288236712828e-06, "loss": 0.026, "reward": 0.0025053498684428632, "reward_std": 0.09982207790017128, "rewards/ndcg_rule_reward": -0.020932150073349476, "rewards/rule_reward": 0.0234375, "step": 506, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3073658684449833, "grad_norm": 1.4370685815811157, "kl": 24.875, "learning_rate": 9.606382001942256e-06, "loss": 0.0248, "reward": 0.0023434034665115178, "reward_std": 0.09148291498422623, "rewards/ndcg_rule_reward": -0.019140971824526787, "rewards/rule_reward": 0.021484375, "step": 507, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3079721127614429, "grad_norm": 2.5927932262420654, "kl": 54.375, "learning_rate": 9.604471330183082e-06, "loss": 0.0544, "reward": 0.0044284602627158165, "reward_std": 0.14938164502382278, "rewards/ndcg_rule_reward": -0.03072778880596161, "rewards/rule_reward": 0.03515625, "step": 508, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3085783570779024, "grad_norm": 1.4075607061386108, "kl": 35.0625, "learning_rate": 9.60255622327572e-06, "loss": 0.0351, "reward": 0.003285455983132124, "reward_std": 0.10788633301854134, "rewards/ndcg_rule_reward": -0.022105170413851738, "rewards/rule_reward": 0.025390625, "step": 509, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.30918460139436194, "grad_norm": 2.7627456188201904, "kl": 34.03125, "learning_rate": 9.600636683064847e-06, "loss": 0.034, "reward": 0.003334860084578395, "reward_std": 0.10785968974232674, "rewards/ndcg_rule_reward": -0.02205576468259096, "rewards/rule_reward": 0.025390625, "step": 510, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.30979084571082144, "grad_norm": 1.7641066312789917, "kl": 41.3125, "learning_rate": 9.598712711399416e-06, "loss": 0.0413, "reward": 0.004239274887368083, "reward_std": 0.14110499620437622, "rewards/ndcg_rule_reward": -0.028963849879801273, "rewards/rule_reward": 0.033203125, "step": 511, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.310397090027281, "grad_norm": 0.9112984538078308, "kl": 43.75, "learning_rate": 9.596784310132649e-06, "loss": 0.0438, "reward": 0.0029529709136113524, "reward_std": 0.09118929877877235, "rewards/ndcg_rule_reward": -0.018531405366957188, "rewards/rule_reward": 0.021484375, "step": 512, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3110033343437405, "grad_norm": 1.9598129987716675, "kl": 2.078125, "learning_rate": 9.594851481122032e-06, "loss": 0.0021, "reward": 0.004747268743813038, "reward_std": 0.1493164524435997, "rewards/ndcg_rule_reward": -0.030408981256186962, "rewards/rule_reward": 0.03515625, "step": 513, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.31160957866020006, "grad_norm": 1.8300822973251343, "kl": 11.21875, "learning_rate": 9.592914226229315e-06, "loss": 0.0113, "reward": 0.0026962633710354567, "reward_std": 0.10813464224338531, "rewards/ndcg_rule_reward": -0.022694361861795187, "rewards/rule_reward": 0.025390625, "step": 514, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.3122158229766596, "grad_norm": 2.0996596813201904, "kl": 7.140625, "learning_rate": 9.590972547320514e-06, "loss": 0.0071, "reward": 0.004516197252087295, "reward_std": 0.14092271029949188, "rewards/ndcg_rule_reward": -0.028686927631497383, "rewards/rule_reward": 0.033203125, "step": 515, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3128220672931191, "grad_norm": 1.7018095254898071, "kl": 8.671875, "learning_rate": 9.589026446265907e-06, "loss": 0.0087, "reward": 0.0025911658303812146, "reward_std": 0.11666036397218704, "rewards/ndcg_rule_reward": -0.024752583354711533, "rewards/rule_reward": 0.02734375, "step": 516, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3134283116095787, "grad_norm": 2.0019989013671875, "kl": 7.859375, "learning_rate": 9.587075924940027e-06, "loss": 0.0078, "reward": 0.0034986311802640557, "reward_std": 0.1161809116601944, "rewards/ndcg_rule_reward": -0.023845119401812553, "rewards/rule_reward": 0.02734375, "step": 517, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3140345559260382, "grad_norm": 2.3840560913085938, "kl": 24.0625, "learning_rate": 9.585120985221672e-06, "loss": 0.0241, "reward": 0.0033677942119538784, "reward_std": 0.11625877767801285, "rewards/ndcg_rule_reward": -0.02397595439106226, "rewards/rule_reward": 0.02734375, "step": 518, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.31464080024249774, "grad_norm": 1.6695921421051025, "kl": 7.71875, "learning_rate": 9.583161628993887e-06, "loss": 0.0077, "reward": 0.0030167842051014304, "reward_std": 0.0995689183473587, "rewards/ndcg_rule_reward": -0.020420716144144535, "rewards/rule_reward": 0.0234375, "step": 519, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.31524704455895725, "grad_norm": 2.1328847408294678, "kl": 28.9375, "learning_rate": 9.581197858143977e-06, "loss": 0.029, "reward": 0.002596725302282721, "reward_std": 0.0829700455069542, "rewards/ndcg_rule_reward": -0.016934525221586227, "rewards/rule_reward": 0.01953125, "step": 520, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.017578125, "epoch": 0.3158532888754168, "grad_norm": 1.5574710369110107, "kl": 16.53125, "learning_rate": 9.5792296745635e-06, "loss": 0.0165, "reward": 0.0032888357527554035, "reward_std": 0.10788662359118462, "rewards/ndcg_rule_reward": -0.02210178878158331, "rewards/rule_reward": 0.025390625, "step": 521, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3164595331918763, "grad_norm": 1.3526898622512817, "kl": 10.25, "learning_rate": 9.57725708014826e-06, "loss": 0.0103, "reward": 0.0029797368915751576, "reward_std": 0.08275476470589638, "rewards/ndcg_rule_reward": -0.016551513224840164, "rewards/rule_reward": 0.01953125, "step": 522, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.31706577750833587, "grad_norm": 3.509141445159912, "kl": 34.625, "learning_rate": 9.57528007679831e-06, "loss": 0.0347, "reward": 0.0038219636771827936, "reward_std": 0.116045743227005, "rewards/ndcg_rule_reward": -0.023521785624325275, "rewards/rule_reward": 0.02734375, "step": 523, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.31767202182479537, "grad_norm": 2.610586404800415, "kl": 28.8125, "learning_rate": 9.573298666417957e-06, "loss": 0.0288, "reward": 0.002140456752385944, "reward_std": 0.10840880498290062, "rewards/ndcg_rule_reward": -0.023250168189406395, "rewards/rule_reward": 0.025390625, "step": 524, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.31827826614125493, "grad_norm": 1.9758137464523315, "kl": 38.6875, "learning_rate": 9.571312850915742e-06, "loss": 0.0387, "reward": 0.004319640342146158, "reward_std": 0.141047902405262, "rewards/ndcg_rule_reward": -0.02888348512351513, "rewards/rule_reward": 0.033203125, "step": 525, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.31888451045771443, "grad_norm": 1.6993845701217651, "kl": 34.75, "learning_rate": 9.569322632204458e-06, "loss": 0.0347, "reward": 0.0031975434976629913, "reward_std": 0.10791159048676491, "rewards/ndcg_rule_reward": -0.02219308167695999, "rewards/rule_reward": 0.025390625, "step": 526, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.319490754774174, "grad_norm": 2.1067821979522705, "kl": 22.84375, "learning_rate": 9.567328012201138e-06, "loss": 0.0229, "reward": 0.0025085133966058493, "reward_std": 0.09979058057069778, "rewards/ndcg_rule_reward": -0.02092898590490222, "rewards/rule_reward": 0.0234375, "step": 527, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.32009699909063355, "grad_norm": 4.423366546630859, "kl": 30.5625, "learning_rate": 9.56532899282705e-06, "loss": 0.0306, "reward": 0.002889104769565165, "reward_std": 0.10803801193833351, "rewards/ndcg_rule_reward": -0.02250151988118887, "rewards/rule_reward": 0.025390625, "step": 528, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.32070324340709305, "grad_norm": 1.5990551710128784, "kl": 20.71875, "learning_rate": 9.563325576007702e-06, "loss": 0.0207, "reward": 0.004543975810520351, "reward_std": 0.1493803896009922, "rewards/ndcg_rule_reward": -0.030612275004386902, "rewards/rule_reward": 0.03515625, "step": 529, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.474609375, "epoch": 0.3213094877235526, "grad_norm": 2.12130069732666, "kl": 31.0, "learning_rate": 9.561317763672838e-06, "loss": 0.0309, "reward": 0.0027645749505609274, "reward_std": 0.08285342529416084, "rewards/ndcg_rule_reward": -0.016766675747931004, "rewards/rule_reward": 0.01953125, "step": 530, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3219157320400121, "grad_norm": 3.201624870300293, "kl": 29.8125, "learning_rate": 9.559305557756438e-06, "loss": 0.0299, "reward": 0.00433929986320436, "reward_std": 0.14105477929115295, "rewards/ndcg_rule_reward": -0.028863824903964996, "rewards/rule_reward": 0.033203125, "step": 531, "token_diversity": 0.34765625 }, { "categorical_diversity": 1.0, "completion_length": 7.162109375, "epoch": 0.3225219763564717, "grad_norm": 1.1106078624725342, "kl": 12.1875, "learning_rate": 9.557288960196708e-06, "loss": 0.0122, "reward": 0.002019265084527433, "reward_std": 0.07478545233607292, "rewards/ndcg_rule_reward": -0.015558860264718533, "rewards/rule_reward": 0.017578125, "step": 532, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3231282206729312, "grad_norm": 5.258099555969238, "kl": 18.21875, "learning_rate": 9.55526797293609e-06, "loss": 0.0183, "reward": 0.0025786650367081165, "reward_std": 0.0913754478096962, "rewards/ndcg_rule_reward": -0.01890571042895317, "rewards/rule_reward": 0.021484375, "step": 533, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.32373446498939074, "grad_norm": 0.9671241641044617, "kl": 7.5859375, "learning_rate": 9.553242597921254e-06, "loss": 0.0076, "reward": 0.002444704936351627, "reward_std": 0.07458799332380295, "rewards/ndcg_rule_reward": -0.015133420703932643, "rewards/rule_reward": 0.017578125, "step": 534, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 8.1328125, "epoch": 0.32434070930585024, "grad_norm": 2.359344244003296, "kl": 13.65625, "learning_rate": 9.551212837103092e-06, "loss": 0.0137, "reward": 0.0023334482684731483, "reward_std": 0.09990744292736053, "rewards/ndcg_rule_reward": -0.021104052662849426, "rewards/rule_reward": 0.0234375, "step": 535, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3249469536223098, "grad_norm": 1.123844027519226, "kl": 20.6875, "learning_rate": 9.549178692436725e-06, "loss": 0.0207, "reward": 0.0032133108470588923, "reward_std": 0.09947521984577179, "rewards/ndcg_rule_reward": -0.02022418938577175, "rewards/rule_reward": 0.0234375, "step": 536, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3255531979387693, "grad_norm": 1.6408145427703857, "kl": 29.0, "learning_rate": 9.547140165881492e-06, "loss": 0.029, "reward": 0.0031369151547551155, "reward_std": 0.0995333231985569, "rewards/ndcg_rule_reward": -0.02030058577656746, "rewards/rule_reward": 0.0234375, "step": 537, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.32615944225522886, "grad_norm": 2.3981475830078125, "kl": 34.9375, "learning_rate": 9.545097259400958e-06, "loss": 0.0349, "reward": 0.00424225302413106, "reward_std": 0.12424849346280098, "rewards/ndcg_rule_reward": -0.025054622441530228, "rewards/rule_reward": 0.029296875, "step": 538, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.32676568657168836, "grad_norm": 1.6181310415267944, "kl": 21.625, "learning_rate": 9.543049974962904e-06, "loss": 0.0216, "reward": 0.002668366825673729, "reward_std": 0.0913439616560936, "rewards/ndcg_rule_reward": -0.01881600823253393, "rewards/rule_reward": 0.021484375, "step": 539, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3273719308881479, "grad_norm": 2.1297712326049805, "kl": 32.5625, "learning_rate": 9.540998314539327e-06, "loss": 0.0325, "reward": 0.004079852136783302, "reward_std": 0.14955439418554306, "rewards/ndcg_rule_reward": -0.031076397746801376, "rewards/rule_reward": 0.03515625, "step": 540, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3279781752046075, "grad_norm": 1.9073894023895264, "kl": 28.6875, "learning_rate": 9.538942280106443e-06, "loss": 0.0287, "reward": 0.0035813262220472097, "reward_std": 0.11615246906876564, "rewards/ndcg_rule_reward": -0.023762423545122147, "rewards/rule_reward": 0.02734375, "step": 541, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.328584419521067, "grad_norm": 1.2602624893188477, "kl": 14.4375, "learning_rate": 9.536881873644676e-06, "loss": 0.0144, "reward": 0.002847215684596449, "reward_std": 0.08282498642802238, "rewards/ndcg_rule_reward": -0.016684033907949924, "rewards/rule_reward": 0.01953125, "step": 542, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.32919066383752654, "grad_norm": 2.107250690460205, "kl": 26.4375, "learning_rate": 9.534817097138667e-06, "loss": 0.0265, "reward": 0.0024409047910012305, "reward_std": 0.06619073450565338, "rewards/ndcg_rule_reward": -0.013184095732867718, "rewards/rule_reward": 0.015625, "step": 543, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.32979690815398605, "grad_norm": 1.744662880897522, "kl": 22.3125, "learning_rate": 9.532747952577259e-06, "loss": 0.0223, "reward": 0.0035206512548029423, "reward_std": 0.11617282778024673, "rewards/ndcg_rule_reward": -0.023823099210858345, "rewards/rule_reward": 0.02734375, "step": 544, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3304031524704456, "grad_norm": 2.3449320793151855, "kl": 25.5, "learning_rate": 9.530674441953508e-06, "loss": 0.0255, "reward": 0.0038657470140606165, "reward_std": 0.14128277450799942, "rewards/ndcg_rule_reward": -0.029337377287447453, "rewards/rule_reward": 0.033203125, "step": 545, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3310093967869051, "grad_norm": 2.3367865085601807, "kl": 33.6875, "learning_rate": 9.528596567264676e-06, "loss": 0.0338, "reward": 0.0033626575022935867, "reward_std": 0.124650988727808, "rewards/ndcg_rule_reward": -0.025934217497706413, "rewards/rule_reward": 0.029296875, "step": 546, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.33161564110336467, "grad_norm": 1.5397814512252808, "kl": 23.375, "learning_rate": 9.526514330512225e-06, "loss": 0.0234, "reward": 0.0027436281088739634, "reward_std": 0.09132226556539536, "rewards/ndcg_rule_reward": -0.01874074712395668, "rewards/rule_reward": 0.021484375, "step": 547, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.2109375, "epoch": 0.33222188541982417, "grad_norm": 1.4475377798080444, "kl": 15.46875, "learning_rate": 9.524427733701821e-06, "loss": 0.0154, "reward": 0.0027731070294976234, "reward_std": 0.10814215242862701, "rewards/ndcg_rule_reward": -0.02261751890182495, "rewards/rule_reward": 0.025390625, "step": 548, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.33282812973628373, "grad_norm": 2.6083381175994873, "kl": 26.75, "learning_rate": 9.522336778843328e-06, "loss": 0.0267, "reward": 0.0029197955736890435, "reward_std": 0.12487956136465073, "rewards/ndcg_rule_reward": -0.026377080008387566, "rewards/rule_reward": 0.029296875, "step": 549, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.33343437405274323, "grad_norm": 1.7484132051467896, "kl": 36.875, "learning_rate": 9.520241467950812e-06, "loss": 0.0368, "reward": 0.003998373751528561, "reward_std": 0.10753435268998146, "rewards/ndcg_rule_reward": -0.021392250433564186, "rewards/rule_reward": 0.025390625, "step": 550, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.75390625, "epoch": 0.3340406183692028, "grad_norm": 1.7325677871704102, "kl": 26.21875, "learning_rate": 9.518141803042528e-06, "loss": 0.0262, "reward": 0.003588999854400754, "reward_std": 0.10772841423749924, "rewards/ndcg_rule_reward": -0.021801626309752464, "rewards/rule_reward": 0.025390625, "step": 551, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.33464686268566235, "grad_norm": 4.301514148712158, "kl": 37.375, "learning_rate": 9.516037786140929e-06, "loss": 0.0374, "reward": 0.0040821346919983625, "reward_std": 0.1159343533217907, "rewards/ndcg_rule_reward": -0.023261615075170994, "rewards/rule_reward": 0.02734375, "step": 552, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.33525310700212185, "grad_norm": 1.6572215557098389, "kl": 28.0, "learning_rate": 9.513929419272662e-06, "loss": 0.0279, "reward": 0.004272955469787121, "reward_std": 0.14106611907482147, "rewards/ndcg_rule_reward": -0.02893016953021288, "rewards/rule_reward": 0.033203125, "step": 553, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3358593513185814, "grad_norm": 1.7306526899337769, "kl": 35.625, "learning_rate": 9.511816704468559e-06, "loss": 0.0357, "reward": 0.00331629638094455, "reward_std": 0.11627272516489029, "rewards/ndcg_rule_reward": -0.024027453735470772, "rewards/rule_reward": 0.02734375, "step": 554, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3364655956350409, "grad_norm": 1.4015440940856934, "kl": 27.3125, "learning_rate": 9.509699643763645e-06, "loss": 0.0274, "reward": 0.0030914288945496082, "reward_std": 0.09954328462481499, "rewards/ndcg_rule_reward": -0.02034607157111168, "rewards/rule_reward": 0.0234375, "step": 555, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3370718399515005, "grad_norm": 1.1447807550430298, "kl": 24.421875, "learning_rate": 9.507578239197126e-06, "loss": 0.0245, "reward": 0.002812716062180698, "reward_std": 0.09127030335366726, "rewards/ndcg_rule_reward": -0.018671658588573337, "rewards/rule_reward": 0.021484375, "step": 556, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.33767808426796, "grad_norm": 1.186950445175171, "kl": 20.9375, "learning_rate": 9.505452492812398e-06, "loss": 0.0209, "reward": 0.002685261075384915, "reward_std": 0.0913405790925026, "rewards/ndcg_rule_reward": -0.018799114041030407, "rewards/rule_reward": 0.021484375, "step": 557, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.33828432858441954, "grad_norm": 1.0721755027770996, "kl": 30.96875, "learning_rate": 9.503322406657031e-06, "loss": 0.031, "reward": 0.0028970579733140767, "reward_std": 0.09120915830135345, "rewards/ndcg_rule_reward": -0.018587318249046803, "rewards/rule_reward": 0.021484375, "step": 558, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.33889057290087904, "grad_norm": 1.5716841220855713, "kl": 32.25, "learning_rate": 9.501187982782785e-06, "loss": 0.0323, "reward": 0.0032827126560732722, "reward_std": 0.09102357923984528, "rewards/ndcg_rule_reward": -0.018201662227511406, "rewards/rule_reward": 0.021484375, "step": 559, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.287109375, "epoch": 0.3394968172173386, "grad_norm": 1.5676828622817993, "kl": 34.5, "learning_rate": 9.499049223245593e-06, "loss": 0.0345, "reward": 0.00424753432162106, "reward_std": 0.1242518238723278, "rewards/ndcg_rule_reward": -0.025049340911209583, "rewards/rule_reward": 0.029296875, "step": 560, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3401030615337981, "grad_norm": 1.805146336555481, "kl": 17.96875, "learning_rate": 9.496906130105562e-06, "loss": 0.018, "reward": 0.002822506125085056, "reward_std": 0.1080862358212471, "rewards/ndcg_rule_reward": -0.022568119689822197, "rewards/rule_reward": 0.025390625, "step": 561, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.34070930585025766, "grad_norm": 1.7016032934188843, "kl": 17.34375, "learning_rate": 9.494758705426978e-06, "loss": 0.0173, "reward": 0.0023090909235179424, "reward_std": 0.0830951202660799, "rewards/ndcg_rule_reward": -0.017222159542143345, "rewards/rule_reward": 0.01953125, "step": 562, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.34131555016671716, "grad_norm": 1.6812220811843872, "kl": 24.4375, "learning_rate": 9.492606951278294e-06, "loss": 0.0245, "reward": 0.0032378151081502438, "reward_std": 0.10789933800697327, "rewards/ndcg_rule_reward": -0.022152810357511044, "rewards/rule_reward": 0.025390625, "step": 563, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3419217944831767, "grad_norm": 1.6285988092422485, "kl": 29.4375, "learning_rate": 9.490450869732141e-06, "loss": 0.0295, "reward": 0.0026902765966951847, "reward_std": 0.09130783006548882, "rewards/ndcg_rule_reward": -0.018794098868966103, "rewards/rule_reward": 0.021484375, "step": 564, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3425280387996363, "grad_norm": 1.6340198516845703, "kl": 31.8125, "learning_rate": 9.48829046286531e-06, "loss": 0.0319, "reward": 0.0031232128385454416, "reward_std": 0.09110982343554497, "rewards/ndcg_rule_reward": -0.018361162394285202, "rewards/rule_reward": 0.021484375, "step": 565, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.583984375, "epoch": 0.3431342831160958, "grad_norm": 2.0976381301879883, "kl": 29.875, "learning_rate": 9.486125732758762e-06, "loss": 0.0298, "reward": 0.004618594772182405, "reward_std": 0.16618357598781586, "rewards/ndcg_rule_reward": -0.034443904645740986, "rewards/rule_reward": 0.0390625, "step": 566, "token_diversity": 0.3284451844262295 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.34374052743255534, "grad_norm": 2.8346316814422607, "kl": 48.0625, "learning_rate": 9.483956681497624e-06, "loss": 0.048, "reward": 0.0031765246530994773, "reward_std": 0.09109440818428993, "rewards/ndcg_rule_reward": -0.018307850696146488, "rewards/rule_reward": 0.021484375, "step": 567, "token_diversity": 0.3671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.34434677174901485, "grad_norm": 1.8646767139434814, "kl": 17.46875, "learning_rate": 9.481783311171183e-06, "loss": 0.0175, "reward": 0.0023090909817256033, "reward_std": 0.08309512585401535, "rewards/ndcg_rule_reward": -0.017222159076482058, "rewards/rule_reward": 0.01953125, "step": 568, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3449530160654744, "grad_norm": 1.0182245969772339, "kl": 22.8125, "learning_rate": 9.479605623872885e-06, "loss": 0.0228, "reward": 0.002334281394723803, "reward_std": 0.07465449720621109, "rewards/ndcg_rule_reward": -0.015243844594806433, "rewards/rule_reward": 0.017578125, "step": 569, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3455592603819339, "grad_norm": 4.063663959503174, "kl": 17.71875, "learning_rate": 9.477423621700338e-06, "loss": 0.0177, "reward": 0.004385545733384788, "reward_std": 0.13260472565889359, "rewards/ndcg_rule_reward": -0.02686445415019989, "rewards/rule_reward": 0.03125, "step": 570, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.34616550469839347, "grad_norm": 1.593437671661377, "kl": 15.84375, "learning_rate": 9.475237306755303e-06, "loss": 0.0158, "reward": 0.003042673459276557, "reward_std": 0.10799328982830048, "rewards/ndcg_rule_reward": -0.0223479513078928, "rewards/rule_reward": 0.025390625, "step": 571, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.34677174901485297, "grad_norm": 1.4517689943313599, "kl": 27.3125, "learning_rate": 9.473046681143694e-06, "loss": 0.0273, "reward": 0.0038142832927405834, "reward_std": 0.1412741094827652, "rewards/ndcg_rule_reward": -0.029388842172920704, "rewards/rule_reward": 0.033203125, "step": 572, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.34737799333131253, "grad_norm": 1.4362196922302246, "kl": 13.625, "learning_rate": 9.470851746975582e-06, "loss": 0.0136, "reward": 0.0025455630384385586, "reward_std": 0.0829540342092514, "rewards/ndcg_rule_reward": -0.016985686495900154, "rewards/rule_reward": 0.01953125, "step": 573, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.34798423764777203, "grad_norm": 1.4313244819641113, "kl": 24.3125, "learning_rate": 9.468652506365186e-06, "loss": 0.0243, "reward": 0.0034497531596571207, "reward_std": 0.12462709099054337, "rewards/ndcg_rule_reward": -0.025847122073173523, "rewards/rule_reward": 0.029296875, "step": 574, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3485904819642316, "grad_norm": 2.2958455085754395, "kl": 26.8125, "learning_rate": 9.466448961430872e-06, "loss": 0.0269, "reward": 0.004262711852788925, "reward_std": 0.1326429396867752, "rewards/ndcg_rule_reward": -0.026987288147211075, "rewards/rule_reward": 0.03125, "step": 575, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3491967262806911, "grad_norm": 1.4819543361663818, "kl": 46.8125, "learning_rate": 9.46424111429515e-06, "loss": 0.0467, "reward": 0.00407663022633642, "reward_std": 0.11590463668107986, "rewards/ndcg_rule_reward": -0.02326711919158697, "rewards/rule_reward": 0.02734375, "step": 576, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.34980297059715065, "grad_norm": 2.1364712715148926, "kl": 29.5, "learning_rate": 9.46202896708468e-06, "loss": 0.0295, "reward": 0.002639997284859419, "reward_std": 0.09132109209895134, "rewards/ndcg_rule_reward": -0.018844377249479294, "rewards/rule_reward": 0.021484375, "step": 577, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 6.48828125, "epoch": 0.3504092149136102, "grad_norm": 2.6232845783233643, "kl": 31.5, "learning_rate": 9.459812521930259e-06, "loss": 0.0315, "reward": 0.0037126000970602036, "reward_std": 0.11608782038092613, "rewards/ndcg_rule_reward": -0.023631148971617222, "rewards/rule_reward": 0.02734375, "step": 578, "token_diversity": 0.25948660714285715 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3510154592300697, "grad_norm": 2.250873565673828, "kl": 32.75, "learning_rate": 9.457591780966825e-06, "loss": 0.0327, "reward": 0.003023692639544606, "reward_std": 0.0995619110763073, "rewards/ndcg_rule_reward": -0.020413807593286037, "rewards/rule_reward": 0.0234375, "step": 579, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.1875, "epoch": 0.3516217035465293, "grad_norm": 5.657206058502197, "kl": 43.640625, "learning_rate": 9.455366746333454e-06, "loss": 0.0436, "reward": 0.004491522442549467, "reward_std": 0.14939452335238457, "rewards/ndcg_rule_reward": -0.030664727091789246, "rewards/rule_reward": 0.03515625, "step": 580, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3522279478629888, "grad_norm": 2.3627028465270996, "kl": 57.375, "learning_rate": 9.45313742017336e-06, "loss": 0.0574, "reward": 0.0029514117632061243, "reward_std": 0.09119003638625145, "rewards/ndcg_rule_reward": -0.018532964400947094, "rewards/rule_reward": 0.021484375, "step": 581, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.35283419217944834, "grad_norm": 1.290804386138916, "kl": 38.4375, "learning_rate": 9.450903804633888e-06, "loss": 0.0384, "reward": 0.004044554429128766, "reward_std": 0.13277064263820648, "rewards/ndcg_rule_reward": -0.027205444872379303, "rewards/rule_reward": 0.03125, "step": 582, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.35344043649590784, "grad_norm": 1.552663803100586, "kl": 27.6875, "learning_rate": 9.448665901866514e-06, "loss": 0.0277, "reward": 0.003225951222702861, "reward_std": 0.11629221588373184, "rewards/ndcg_rule_reward": -0.02411779947578907, "rewards/rule_reward": 0.02734375, "step": 583, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3540466808123674, "grad_norm": 1.512794852256775, "kl": 19.9375, "learning_rate": 9.446423714026846e-06, "loss": 0.0199, "reward": 0.003801652928814292, "reward_std": 0.132875956594944, "rewards/ndcg_rule_reward": -0.027448348701000214, "rewards/rule_reward": 0.03125, "step": 584, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3546529251288269, "grad_norm": 1.8421763181686401, "kl": 33.03125, "learning_rate": 9.444177243274619e-06, "loss": 0.033, "reward": 0.003906126832589507, "reward_std": 0.1496378369629383, "rewards/ndcg_rule_reward": -0.03125012293457985, "rewards/rule_reward": 0.03515625, "step": 585, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.35525916944528646, "grad_norm": 1.5541465282440186, "kl": 18.4375, "learning_rate": 9.441926491773692e-06, "loss": 0.0184, "reward": 0.00321994093246758, "reward_std": 0.12472984567284584, "rewards/ndcg_rule_reward": -0.026076934300363064, "rewards/rule_reward": 0.029296875, "step": 586, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.35586541376174596, "grad_norm": 1.3854684829711914, "kl": 20.21875, "learning_rate": 9.439671461692046e-06, "loss": 0.0202, "reward": 0.0034570731222629547, "reward_std": 0.10780017450451851, "rewards/ndcg_rule_reward": -0.021933551877737045, "rewards/rule_reward": 0.025390625, "step": 587, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3564716580782055, "grad_norm": 2.151334285736084, "kl": 25.75, "learning_rate": 9.437412155201791e-06, "loss": 0.0258, "reward": 0.004369061556644738, "reward_std": 0.14946455508470535, "rewards/ndcg_rule_reward": -0.030787188559770584, "rewards/rule_reward": 0.03515625, "step": 588, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.357077902394665, "grad_norm": 1.5251272916793823, "kl": 35.875, "learning_rate": 9.435148574479144e-06, "loss": 0.0359, "reward": 0.004131814697757363, "reward_std": 0.13270948454737663, "rewards/ndcg_rule_reward": -0.027118184603750706, "rewards/rule_reward": 0.03125, "step": 589, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3576841467111246, "grad_norm": 1.7556098699569702, "kl": 18.28125, "learning_rate": 9.43288072170445e-06, "loss": 0.0183, "reward": 0.0032835310557857156, "reward_std": 0.11628052219748497, "rewards/ndcg_rule_reward": -0.02406021859496832, "rewards/rule_reward": 0.02734375, "step": 590, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.35829039102758414, "grad_norm": 1.7318283319473267, "kl": 45.125, "learning_rate": 9.430608599062167e-06, "loss": 0.045, "reward": 0.0029683057218790054, "reward_std": 0.09118665009737015, "rewards/ndcg_rule_reward": -0.018516069278120995, "rewards/rule_reward": 0.021484375, "step": 591, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.35889663534404365, "grad_norm": 1.6224547624588013, "kl": 8.453125, "learning_rate": 9.428332208740858e-06, "loss": 0.0084, "reward": 0.002204021904617548, "reward_std": 0.09155785292387009, "rewards/ndcg_rule_reward": -0.019280352629721165, "rewards/rule_reward": 0.021484375, "step": 592, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3595028796605032, "grad_norm": 3.395256280899048, "kl": 23.8125, "learning_rate": 9.426051552933205e-06, "loss": 0.0238, "reward": 0.0021438696421682835, "reward_std": 0.06631879508495331, "rewards/ndcg_rule_reward": -0.013481130823493004, "rewards/rule_reward": 0.015625, "step": 593, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.5, "epoch": 0.3601091239769627, "grad_norm": 1.0146225690841675, "kl": 28.125, "learning_rate": 9.423766633835994e-06, "loss": 0.0281, "reward": 0.0021347274305298924, "reward_std": 0.07475585862994194, "rewards/ndcg_rule_reward": -0.01544339768588543, "rewards/rule_reward": 0.017578125, "step": 594, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.36071536829342227, "grad_norm": 1.5623764991760254, "kl": 25.8125, "learning_rate": 9.421477453650118e-06, "loss": 0.0258, "reward": 0.0025146104162558913, "reward_std": 0.08300364390015602, "rewards/ndcg_rule_reward": -0.017016639932990074, "rewards/rule_reward": 0.01953125, "step": 595, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.36132161260988177, "grad_norm": 1.1911073923110962, "kl": 34.5, "learning_rate": 9.419184014580582e-06, "loss": 0.0345, "reward": 0.003402408678084612, "reward_std": 0.09938462823629379, "rewards/ndcg_rule_reward": -0.020035091321915388, "rewards/rule_reward": 0.0234375, "step": 596, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.251953125, "epoch": 0.36192785692634133, "grad_norm": 1.3366124629974365, "kl": 20.25, "learning_rate": 9.41688631883648e-06, "loss": 0.0203, "reward": 0.0036338408244773746, "reward_std": 0.12454359233379364, "rewards/ndcg_rule_reward": -0.025663034059107304, "rewards/rule_reward": 0.029296875, "step": 597, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.36253410124280083, "grad_norm": 1.517688274383545, "kl": 13.046875, "learning_rate": 9.41458436863102e-06, "loss": 0.0131, "reward": 0.0023253419203683734, "reward_std": 0.09991877526044846, "rewards/ndcg_rule_reward": -0.021112157963216305, "rewards/rule_reward": 0.0234375, "step": 598, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3631403455592604, "grad_norm": 1.7731307744979858, "kl": 22.484375, "learning_rate": 9.412278166181492e-06, "loss": 0.0224, "reward": 0.0031201497185975313, "reward_std": 0.11637108027935028, "rewards/ndcg_rule_reward": -0.024223600514233112, "rewards/rule_reward": 0.02734375, "step": 599, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3637465898757199, "grad_norm": 1.9346678256988525, "kl": 29.5, "learning_rate": 9.4099677137093e-06, "loss": 0.0295, "reward": 0.002526668133214116, "reward_std": 0.09139347448945045, "rewards/ndcg_rule_reward": -0.018957707099616528, "rewards/rule_reward": 0.021484375, "step": 600, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.36435283419217945, "grad_norm": 2.819342613220215, "kl": 38.4375, "learning_rate": 9.407653013439927e-06, "loss": 0.0384, "reward": 0.003629503771662712, "reward_std": 0.12453782558441162, "rewards/ndcg_rule_reward": -0.025667371228337288, "rewards/rule_reward": 0.029296875, "step": 601, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 8.603515625, "epoch": 0.36495907850863896, "grad_norm": 1.6156522035598755, "kl": 38.625, "learning_rate": 9.405334067602957e-06, "loss": 0.0386, "reward": 0.0036563295871019363, "reward_std": 0.0992760919034481, "rewards/ndcg_rule_reward": -0.019781170412898064, "rewards/rule_reward": 0.0234375, "step": 602, "token_diversity": 0.25073880784708247 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3655653228250985, "grad_norm": 1.881318211555481, "kl": 22.9375, "learning_rate": 9.403010878432059e-06, "loss": 0.023, "reward": 0.003673069877550006, "reward_std": 0.12451829388737679, "rewards/ndcg_rule_reward": -0.025623803958296776, "rewards/rule_reward": 0.029296875, "step": 603, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3661715671415581, "grad_norm": 1.1270312070846558, "kl": 14.671875, "learning_rate": 9.400683448164988e-06, "loss": 0.0147, "reward": 0.0021150679676793516, "reward_std": 0.06632547825574875, "rewards/ndcg_rule_reward": -0.013509932206943631, "rewards/rule_reward": 0.015625, "step": 604, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.3667778114580176, "grad_norm": 1.7213342189788818, "kl": 16.21875, "learning_rate": 9.398351779043587e-06, "loss": 0.0162, "reward": 0.002607073518447578, "reward_std": 0.08293836563825607, "rewards/ndcg_rule_reward": -0.016924177296459675, "rewards/rule_reward": 0.01953125, "step": 605, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 7.71875, "epoch": 0.36738405577447714, "grad_norm": 1.6169514656066895, "kl": 23.375, "learning_rate": 9.396015873313781e-06, "loss": 0.0234, "reward": 0.003124351380392909, "reward_std": 0.10796507447957993, "rewards/ndcg_rule_reward": -0.022266273386776447, "rewards/rule_reward": 0.025390625, "step": 606, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.36799030009093664, "grad_norm": 1.5830743312835693, "kl": 34.375, "learning_rate": 9.393675733225578e-06, "loss": 0.0344, "reward": 0.002199361566454172, "reward_std": 0.0831235907971859, "rewards/ndcg_rule_reward": -0.017331889364868402, "rewards/rule_reward": 0.01953125, "step": 607, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3685965444073962, "grad_norm": 9.29353141784668, "kl": 59.5, "learning_rate": 9.391331361033061e-06, "loss": 0.0595, "reward": 0.003012773406226188, "reward_std": 0.12482483685016632, "rewards/ndcg_rule_reward": -0.026284100487828255, "rewards/rule_reward": 0.029296875, "step": 608, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3692027887238557, "grad_norm": 1.3658835887908936, "kl": 41.375, "learning_rate": 9.388982758994391e-06, "loss": 0.0413, "reward": 0.004059002152644098, "reward_std": 0.11591262370347977, "rewards/ndcg_rule_reward": -0.023284748196601868, "rewards/rule_reward": 0.02734375, "step": 609, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.36980903304031526, "grad_norm": 1.5908311605453491, "kl": 37.375, "learning_rate": 9.386629929371804e-06, "loss": 0.0373, "reward": 0.004211460007354617, "reward_std": 0.14112168550491333, "rewards/ndcg_rule_reward": -0.02899166475981474, "rewards/rule_reward": 0.033203125, "step": 610, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.37041527735677476, "grad_norm": 1.3793776035308838, "kl": 15.298828125, "learning_rate": 9.384272874431608e-06, "loss": 0.0153, "reward": 0.0013270482304506004, "reward_std": 0.04990579932928085, "rewards/ndcg_rule_reward": -0.010391701944172382, "rewards/rule_reward": 0.01171875, "step": 611, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3710215216732343, "grad_norm": 1.746434211730957, "kl": 24.34375, "learning_rate": 9.381911596444178e-06, "loss": 0.0244, "reward": 0.004178425297141075, "reward_std": 0.124301228672266, "rewards/ndcg_rule_reward": -0.0251184506341815, "rewards/rule_reward": 0.029296875, "step": 612, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3716277659896938, "grad_norm": 1.207804560661316, "kl": 12.625, "learning_rate": 9.379546097683963e-06, "loss": 0.0126, "reward": 0.0021367662120610476, "reward_std": 0.07474616914987564, "rewards/ndcg_rule_reward": -0.015441358089447021, "rewards/rule_reward": 0.017578125, "step": 613, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3722340103061534, "grad_norm": 2.165106773376465, "kl": 28.125, "learning_rate": 9.37717638042947e-06, "loss": 0.0281, "reward": 0.0026829722337424755, "reward_std": 0.11655662208795547, "rewards/ndcg_rule_reward": -0.024660777300596237, "rewards/rule_reward": 0.02734375, "step": 614, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3728402546226129, "grad_norm": 3.087675094604492, "kl": 60.25, "learning_rate": 9.374802446963275e-06, "loss": 0.0603, "reward": 0.005338350776582956, "reward_std": 0.18265388160943985, "rewards/ndcg_rule_reward": -0.03763039968907833, "rewards/rule_reward": 0.04296875, "step": 615, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.37344649893907245, "grad_norm": 2.2250430583953857, "kl": 49.625, "learning_rate": 9.372424299572014e-06, "loss": 0.0497, "reward": 0.003448628820478916, "reward_std": 0.10780194401741028, "rewards/ndcg_rule_reward": -0.021941996179521084, "rewards/rule_reward": 0.025390625, "step": 616, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.374052743255532, "grad_norm": 1.4241485595703125, "kl": 36.75, "learning_rate": 9.37004194054638e-06, "loss": 0.0367, "reward": 0.0030751540325582027, "reward_std": 0.09955035895109177, "rewards/ndcg_rule_reward": -0.02036234550178051, "rewards/rule_reward": 0.0234375, "step": 617, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3746589875719915, "grad_norm": 1.8996832370758057, "kl": 17.140625, "learning_rate": 9.367655372181124e-06, "loss": 0.0171, "reward": 0.0030840507824905217, "reward_std": 0.09954768046736717, "rewards/ndcg_rule_reward": -0.02035344857722521, "rewards/rule_reward": 0.0234375, "step": 618, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.37526523188845107, "grad_norm": 0.9752687811851501, "kl": 14.53125, "learning_rate": 9.365264596775052e-06, "loss": 0.0145, "reward": 0.0023399971541948617, "reward_std": 0.06622390449047089, "rewards/ndcg_rule_reward": -0.013285002671182156, "rewards/rule_reward": 0.015625, "step": 619, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.029296875, "epoch": 0.37587147620491057, "grad_norm": 5.6993207931518555, "kl": 19.0625, "learning_rate": 9.362869616631022e-06, "loss": 0.0191, "reward": 0.0025186522980220616, "reward_std": 0.08299766108393669, "rewards/ndcg_rule_reward": -0.017012597993016243, "rewards/rule_reward": 0.01953125, "step": 620, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.37647772052137013, "grad_norm": 1.7186686992645264, "kl": 35.21875, "learning_rate": 9.360470434055945e-06, "loss": 0.0352, "reward": 0.0033833676716312766, "reward_std": 0.09942019730806351, "rewards/ndcg_rule_reward": -0.02005413267761469, "rewards/rule_reward": 0.0234375, "step": 621, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.37708396483782963, "grad_norm": 1.699004888534546, "kl": 23.5625, "learning_rate": 9.35806705136077e-06, "loss": 0.0235, "reward": 0.0032824601512402296, "reward_std": 0.09105431288480759, "rewards/ndcg_rule_reward": -0.018201914615929127, "rewards/rule_reward": 0.021484375, "step": 622, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3776902091542892, "grad_norm": 3.3133506774902344, "kl": 65.125, "learning_rate": 9.355659470860503e-06, "loss": 0.0652, "reward": 0.003534300602041185, "reward_std": 0.1077406033873558, "rewards/ndcg_rule_reward": -0.02185632474720478, "rewards/rule_reward": 0.025390625, "step": 623, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3782964534707487, "grad_norm": 2.043983221054077, "kl": 57.25, "learning_rate": 9.35324769487419e-06, "loss": 0.0573, "reward": 0.00395588856190443, "reward_std": 0.1243717335164547, "rewards/ndcg_rule_reward": -0.02534098643809557, "rewards/rule_reward": 0.029296875, "step": 624, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.37890269778720825, "grad_norm": 3.3164727687835693, "kl": 36.3125, "learning_rate": 9.350831725724915e-06, "loss": 0.0364, "reward": 0.004736738745123148, "reward_std": 0.14082247018814087, "rewards/ndcg_rule_reward": -0.028466385789215565, "rewards/rule_reward": 0.033203125, "step": 625, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.37950894210366776, "grad_norm": 2.650468587875366, "kl": 34.46875, "learning_rate": 9.348411565739804e-06, "loss": 0.0345, "reward": 0.003358634072355926, "reward_std": 0.09100349247455597, "rewards/ndcg_rule_reward": -0.018125740811228752, "rewards/rule_reward": 0.021484375, "step": 626, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3801151864201273, "grad_norm": 5.800010681152344, "kl": 38.0625, "learning_rate": 9.345987217250018e-06, "loss": 0.0381, "reward": 0.003584524034522474, "reward_std": 0.1498045101761818, "rewards/ndcg_rule_reward": -0.03157172538340092, "rewards/rule_reward": 0.03515625, "step": 627, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3807214307365868, "grad_norm": 1.1377406120300293, "kl": 17.34375, "learning_rate": 9.343558682590757e-06, "loss": 0.0174, "reward": 0.0027893249643966556, "reward_std": 0.08284837007522583, "rewards/ndcg_rule_reward": -0.01674192538484931, "rewards/rule_reward": 0.01953125, "step": 628, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.3813276750530464, "grad_norm": 2.5449366569519043, "kl": 15.875, "learning_rate": 9.341125964101243e-06, "loss": 0.0159, "reward": 0.0014871663297526538, "reward_std": 0.05821993947029114, "rewards/ndcg_rule_reward": -0.012184708379209042, "rewards/rule_reward": 0.013671875, "step": 629, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.38193391936950594, "grad_norm": 1.5121517181396484, "kl": 16.375, "learning_rate": 9.338689064124742e-06, "loss": 0.0164, "reward": 0.0019176641944795847, "reward_std": 0.07487952336668968, "rewards/ndcg_rule_reward": -0.01566046103835106, "rewards/rule_reward": 0.017578125, "step": 630, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.38254016368596544, "grad_norm": 2.2469139099121094, "kl": 18.9375, "learning_rate": 9.336247985008533e-06, "loss": 0.019, "reward": 0.003350079641677439, "reward_std": 0.11626536026597023, "rewards/ndcg_rule_reward": -0.02399367094039917, "rewards/rule_reward": 0.02734375, "step": 631, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.383146408002425, "grad_norm": 1.5999655723571777, "kl": 25.6875, "learning_rate": 9.333802729103934e-06, "loss": 0.0257, "reward": 0.0028008471708744764, "reward_std": 0.10813703015446663, "rewards/ndcg_rule_reward": -0.02258977759629488, "rewards/rule_reward": 0.025390625, "step": 632, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3837526523188845, "grad_norm": 2.405780076980591, "kl": 35.125, "learning_rate": 9.331353298766278e-06, "loss": 0.0351, "reward": 0.0027609578100964427, "reward_std": 0.08285405300557613, "rewards/ndcg_rule_reward": -0.016770291840657592, "rewards/rule_reward": 0.01953125, "step": 633, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.38435889663534406, "grad_norm": 1.7570196390151978, "kl": 10.109375, "learning_rate": 9.328899696354919e-06, "loss": 0.0101, "reward": 0.002504462725482881, "reward_std": 0.11665239930152893, "rewards/ndcg_rule_reward": -0.024839287623763084, "rewards/rule_reward": 0.02734375, "step": 634, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.38496514095180356, "grad_norm": 1.4439867734909058, "kl": 18.375, "learning_rate": 9.326441924233232e-06, "loss": 0.0183, "reward": 0.0028208729345351458, "reward_std": 0.09970004111528397, "rewards/ndcg_rule_reward": -0.020616628229618073, "rewards/rule_reward": 0.0234375, "step": 635, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3855713852682631, "grad_norm": 1.9286831617355347, "kl": 28.59375, "learning_rate": 9.32397998476861e-06, "loss": 0.0285, "reward": 0.0036293267039582133, "reward_std": 0.12453867495059967, "rewards/ndcg_rule_reward": -0.025667548179626465, "rewards/rule_reward": 0.029296875, "step": 636, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3861776295847226, "grad_norm": 1.9330912828445435, "kl": 33.1875, "learning_rate": 9.321513880332458e-06, "loss": 0.0332, "reward": 0.0031710342736914754, "reward_std": 0.10792317986488342, "rewards/ndcg_rule_reward": -0.022219590842723846, "rewards/rule_reward": 0.025390625, "step": 637, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3867838739011822, "grad_norm": 1.4725762605667114, "kl": 13.0859375, "learning_rate": 9.31904361330019e-06, "loss": 0.0131, "reward": 0.002274320926517248, "reward_std": 0.09149491414427757, "rewards/ndcg_rule_reward": -0.019210053607821465, "rewards/rule_reward": 0.021484375, "step": 638, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3873901182176417, "grad_norm": 1.6981087923049927, "kl": 44.375, "learning_rate": 9.316569186051234e-06, "loss": 0.0444, "reward": 0.004613169934600592, "reward_std": 0.14932021498680115, "rewards/ndcg_rule_reward": -0.03054307959973812, "rewards/rule_reward": 0.03515625, "step": 639, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.38799636253410125, "grad_norm": 3.1997921466827393, "kl": 26.1875, "learning_rate": 9.314090600969023e-06, "loss": 0.0262, "reward": 0.00287653726991266, "reward_std": 0.1080724187195301, "rewards/ndcg_rule_reward": -0.022514088079333305, "rewards/rule_reward": 0.025390625, "step": 640, "token_diversity": 0.37109375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.38860260685056075, "grad_norm": 1.3900493383407593, "kl": 13.78125, "learning_rate": 9.311607860440998e-06, "loss": 0.0137, "reward": 0.0019209763850085437, "reward_std": 0.0748482197523117, "rewards/ndcg_rule_reward": -0.015657149255275726, "rewards/rule_reward": 0.017578125, "step": 641, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.095703125, "epoch": 0.3892088511670203, "grad_norm": 1.5060380697250366, "kl": 21.25, "learning_rate": 9.309120966858595e-06, "loss": 0.0213, "reward": 0.0029500662349164486, "reward_std": 0.09963259845972061, "rewards/ndcg_rule_reward": -0.020487433299422264, "rewards/rule_reward": 0.0234375, "step": 642, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.38981509548347987, "grad_norm": 1.384479284286499, "kl": 21.5, "learning_rate": 9.306629922617262e-06, "loss": 0.0216, "reward": 0.0032697669230401516, "reward_std": 0.12474188581109047, "rewards/ndcg_rule_reward": -0.02602710761129856, "rewards/rule_reward": 0.029296875, "step": 643, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.39042133979993937, "grad_norm": 2.6553730964660645, "kl": 35.375, "learning_rate": 9.304134730116435e-06, "loss": 0.0353, "reward": 0.005013959249481559, "reward_std": 0.1575341746211052, "rewards/ndcg_rule_reward": -0.03209541644901037, "rewards/rule_reward": 0.037109375, "step": 644, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.39102758411639893, "grad_norm": 1.8612946271896362, "kl": 13.84375, "learning_rate": 9.301635391759546e-06, "loss": 0.0139, "reward": 0.002375470707193017, "reward_std": 0.0998687744140625, "rewards/ndcg_rule_reward": -0.021062029991298914, "rewards/rule_reward": 0.0234375, "step": 645, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.39163382843285843, "grad_norm": 1.2362440824508667, "kl": 28.6875, "learning_rate": 9.29913190995403e-06, "loss": 0.0287, "reward": 0.0024999321904033422, "reward_std": 0.0745730847120285, "rewards/ndcg_rule_reward": -0.015078192576766014, "rewards/rule_reward": 0.017578125, "step": 646, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.392240072749318, "grad_norm": 2.4317846298217773, "kl": 40.625, "learning_rate": 9.296624287111302e-06, "loss": 0.0405, "reward": 0.0037306600715965033, "reward_std": 0.12451079115271568, "rewards/ndcg_rule_reward": -0.025566214695572853, "rewards/rule_reward": 0.029296875, "step": 647, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 7.162109375, "epoch": 0.3928463170657775, "grad_norm": 1.2250233888626099, "kl": 26.3125, "learning_rate": 9.294112525646772e-06, "loss": 0.0263, "reward": 0.003082538954913616, "reward_std": 0.0911191962659359, "rewards/ndcg_rule_reward": -0.018401836045086384, "rewards/rule_reward": 0.021484375, "step": 648, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.39345256138223705, "grad_norm": 2.108881711959839, "kl": 28.4375, "learning_rate": 9.291596627979836e-06, "loss": 0.0285, "reward": 0.0034587070113047957, "reward_std": 0.14145099744200706, "rewards/ndcg_rule_reward": -0.02974441833794117, "rewards/rule_reward": 0.033203125, "step": 649, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.39405880569869656, "grad_norm": 2.3787782192230225, "kl": 22.3125, "learning_rate": 9.289076596533873e-06, "loss": 0.0223, "reward": 0.002002980501856655, "reward_std": 0.06639164313673973, "rewards/ndcg_rule_reward": -0.013622019439935684, "rewards/rule_reward": 0.015625, "step": 650, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3946650500151561, "grad_norm": 2.9672558307647705, "kl": 33.875, "learning_rate": 9.28655243373624e-06, "loss": 0.0338, "reward": 0.0030381899559870362, "reward_std": 0.09956645965576172, "rewards/ndcg_rule_reward": -0.020399309694767, "rewards/rule_reward": 0.0234375, "step": 651, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3952712943316156, "grad_norm": 2.135063648223877, "kl": 31.6875, "learning_rate": 9.28402414201828e-06, "loss": 0.0316, "reward": 0.004133842419832945, "reward_std": 0.14114130288362503, "rewards/ndcg_rule_reward": -0.029069283045828342, "rewards/rule_reward": 0.033203125, "step": 652, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3958775386480752, "grad_norm": 45.28485870361328, "kl": 203.75, "learning_rate": 9.281491723815313e-06, "loss": 0.2031, "reward": 0.002737863454967737, "reward_std": 0.08285992592573166, "rewards/ndcg_rule_reward": -0.016793386545032263, "rewards/rule_reward": 0.01953125, "step": 653, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3964837829645347, "grad_norm": 1.853214144706726, "kl": 19.0, "learning_rate": 9.278955181566624e-06, "loss": 0.019, "reward": 0.003463016590103507, "reward_std": 0.10775972530245781, "rewards/ndcg_rule_reward": -0.021927607711404562, "rewards/rule_reward": 0.025390625, "step": 654, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.39709002728099424, "grad_norm": 1.6427987813949585, "kl": 32.5625, "learning_rate": 9.276414517715484e-06, "loss": 0.0326, "reward": 0.004136134288273752, "reward_std": 0.11589070409536362, "rewards/ndcg_rule_reward": -0.023207616060972214, "rewards/rule_reward": 0.02734375, "step": 655, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3976962715974538, "grad_norm": 2.0508689880371094, "kl": 18.28125, "learning_rate": 9.273869734709124e-06, "loss": 0.0183, "reward": 0.0015460379654541612, "reward_std": 0.04977550730109215, "rewards/ndcg_rule_reward": -0.01017271215096116, "rewards/rule_reward": 0.01171875, "step": 656, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.3983025159139133, "grad_norm": 2.127263307571411, "kl": 40.625, "learning_rate": 9.271320834998747e-06, "loss": 0.0407, "reward": 0.0035882063675671816, "reward_std": 0.13298091292381287, "rewards/ndcg_rule_reward": -0.027661794796586037, "rewards/rule_reward": 0.03125, "step": 657, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.39890876023037286, "grad_norm": 3.703655958175659, "kl": 55.0, "learning_rate": 9.268767821039522e-06, "loss": 0.0549, "reward": 0.002615694422274828, "reward_std": 0.09135733917355537, "rewards/ndcg_rule_reward": -0.01886868104338646, "rewards/rule_reward": 0.021484375, "step": 658, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.39951500454683236, "grad_norm": 1.319998860359192, "kl": 29.0625, "learning_rate": 9.266210695290574e-06, "loss": 0.0291, "reward": 0.003677265834994614, "reward_std": 0.12448794394731522, "rewards/ndcg_rule_reward": -0.025619609281420708, "rewards/rule_reward": 0.029296875, "step": 659, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4001212488632919, "grad_norm": 1.8786892890930176, "kl": 15.21875, "learning_rate": 9.263649460215e-06, "loss": 0.0152, "reward": 0.0019485410302877426, "reward_std": 0.0748400129377842, "rewards/ndcg_rule_reward": -0.015629583969712257, "rewards/rule_reward": 0.017578125, "step": 660, "token_diversity": 0.53125 }, { "epoch": 0.4001212488632919, "eval_categorical_diversity": 1.0, "eval_completion_length": 5.0, "eval_kl": 17.011443661971832, "eval_loss": 0.0169614739716053, "eval_reward": 0.0012725769201109946, "eval_reward_std": 0.04640591485609471, "eval_rewards/ndcg_rule_reward": -0.00963466349516956, "eval_rewards/rule_reward": 0.010907240316901408, "eval_runtime": 85.1048, "eval_samples_per_second": 53.252, "eval_steps_per_second": 0.059, "eval_token_diversity": 0.34463028169014087, "step": 660 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4007274931797514, "grad_norm": 2.340139865875244, "kl": 13.78125, "learning_rate": 9.261084118279846e-06, "loss": 0.0138, "reward": 0.002961155609227717, "reward_std": 0.09118783473968506, "rewards/ndcg_rule_reward": -0.018523219041526318, "rewards/rule_reward": 0.021484375, "step": 661, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.401333737496211, "grad_norm": 2.9467015266418457, "kl": 67.5625, "learning_rate": 9.258514671956119e-06, "loss": 0.0676, "reward": 0.003491802839562297, "reward_std": 0.10775332897901535, "rewards/ndcg_rule_reward": -0.02189882192760706, "rewards/rule_reward": 0.025390625, "step": 662, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4019399818126705, "grad_norm": 1.792694330215454, "kl": 60.875, "learning_rate": 9.255941123718775e-06, "loss": 0.0608, "reward": 0.003924879943951964, "reward_std": 0.11598692834377289, "rewards/ndcg_rule_reward": -0.023418869823217392, "rewards/rule_reward": 0.02734375, "step": 663, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.40254622612913005, "grad_norm": 1.6622847318649292, "kl": 45.375, "learning_rate": 9.253363476046727e-06, "loss": 0.0453, "reward": 0.0038121279794722795, "reward_std": 0.13284194841980934, "rewards/ndcg_rule_reward": -0.02743787318468094, "rewards/rule_reward": 0.03125, "step": 664, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 8.603515625, "epoch": 0.40315247044558955, "grad_norm": 1.3391308784484863, "kl": 16.84375, "learning_rate": 9.250781731422829e-06, "loss": 0.0168, "reward": 0.003859490272589028, "reward_std": 0.12442649900913239, "rewards/ndcg_rule_reward": -0.025437384843826294, "rewards/rule_reward": 0.029296875, "step": 665, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4037587147620491, "grad_norm": 2.269493818283081, "kl": 34.25, "learning_rate": 9.248195892333887e-06, "loss": 0.0343, "reward": 0.002997505944222212, "reward_std": 0.11642428860068321, "rewards/ndcg_rule_reward": -0.0243462435901165, "rewards/rule_reward": 0.02734375, "step": 666, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4043649590785086, "grad_norm": 1.1613292694091797, "kl": 24.40625, "learning_rate": 9.24560596127065e-06, "loss": 0.0244, "reward": 0.0025697435485199094, "reward_std": 0.08294513821601868, "rewards/ndcg_rule_reward": -0.016961506567895412, "rewards/rule_reward": 0.01953125, "step": 667, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.40497120339496817, "grad_norm": 2.1462507247924805, "kl": 22.375, "learning_rate": 9.243011940727808e-06, "loss": 0.0224, "reward": 0.0033491115318611264, "reward_std": 0.1330816075205803, "rewards/ndcg_rule_reward": -0.02790088951587677, "rewards/rule_reward": 0.03125, "step": 668, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.40557744771142773, "grad_norm": 1.3397583961486816, "kl": 11.1875, "learning_rate": 9.24041383320399e-06, "loss": 0.0112, "reward": 0.002033907687291503, "reward_std": 0.08320837467908859, "rewards/ndcg_rule_reward": -0.017497343011200428, "rewards/rule_reward": 0.01953125, "step": 669, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.40618369202788723, "grad_norm": 1.5814756155014038, "kl": 20.25, "learning_rate": 9.23781164120176e-06, "loss": 0.0202, "reward": 0.002875623875297606, "reward_std": 0.09121627174317837, "rewards/ndcg_rule_reward": -0.01860875147394836, "rewards/rule_reward": 0.021484375, "step": 670, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4067899363443468, "grad_norm": 2.1058473587036133, "kl": 24.5625, "learning_rate": 9.23520536722762e-06, "loss": 0.0245, "reward": 0.0030234711011871696, "reward_std": 0.09956935420632362, "rewards/ndcg_rule_reward": -0.020414029248058796, "rewards/rule_reward": 0.0234375, "step": 671, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4073961806608063, "grad_norm": 2.0237159729003906, "kl": 22.9375, "learning_rate": 9.232595013792004e-06, "loss": 0.023, "reward": 0.003297417249996215, "reward_std": 0.11629681661725044, "rewards/ndcg_rule_reward": -0.02404633304104209, "rewards/rule_reward": 0.02734375, "step": 672, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.40800242497726585, "grad_norm": 1.0571229457855225, "kl": 19.671875, "learning_rate": 9.229980583409265e-06, "loss": 0.0196, "reward": 0.002775863336864859, "reward_std": 0.09127984568476677, "rewards/ndcg_rule_reward": -0.018708511721342802, "rewards/rule_reward": 0.021484375, "step": 673, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.40860866929372536, "grad_norm": 1.694273829460144, "kl": 6.890625, "learning_rate": 9.227362078597702e-06, "loss": 0.0069, "reward": 0.0019658621167764068, "reward_std": 0.0663989745080471, "rewards/ndcg_rule_reward": -0.013659138232469559, "rewards/rule_reward": 0.015625, "step": 674, "token_diversity": 0.53125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4092149136101849, "grad_norm": 2.364220380783081, "kl": 17.78125, "learning_rate": 9.224739501879518e-06, "loss": 0.0178, "reward": 0.0033664044458419085, "reward_std": 0.11628513410687447, "rewards/ndcg_rule_reward": -0.023977345786988735, "rewards/rule_reward": 0.02734375, "step": 675, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4098211579266444, "grad_norm": 1.39723539352417, "kl": 21.25, "learning_rate": 9.222112855780857e-06, "loss": 0.0213, "reward": 0.004477864480577409, "reward_std": 0.1325434297323227, "rewards/ndcg_rule_reward": -0.026772134006023407, "rewards/rule_reward": 0.03125, "step": 676, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.410427402243104, "grad_norm": 2.2694897651672363, "kl": 21.25, "learning_rate": 9.219482142831765e-06, "loss": 0.0213, "reward": 0.003599421586841345, "reward_std": 0.12455431371927261, "rewards/ndcg_rule_reward": -0.025697453878819942, "rewards/rule_reward": 0.029296875, "step": 677, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4110336465595635, "grad_norm": 5.2516703605651855, "kl": 48.25, "learning_rate": 9.21684736556622e-06, "loss": 0.0482, "reward": 0.004417917225509882, "reward_std": 0.13259341195225716, "rewards/ndcg_rule_reward": -0.02683208230882883, "rewards/rule_reward": 0.03125, "step": 678, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.41163989087602304, "grad_norm": 4.1747517585754395, "kl": 52.0625, "learning_rate": 9.214208526522108e-06, "loss": 0.0521, "reward": 0.0034085488878190517, "reward_std": 0.1162308044731617, "rewards/ndcg_rule_reward": -0.023935201577842236, "rewards/rule_reward": 0.02734375, "step": 679, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.41224613519248254, "grad_norm": 1.342751383781433, "kl": 41.125, "learning_rate": 9.211565628241228e-06, "loss": 0.0411, "reward": 0.002608386566862464, "reward_std": 0.08292930945754051, "rewards/ndcg_rule_reward": -0.016922864597290754, "rewards/rule_reward": 0.01953125, "step": 680, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4128523795089421, "grad_norm": 3.874565601348877, "kl": 69.375, "learning_rate": 9.20891867326929e-06, "loss": 0.0695, "reward": 0.003994259284809232, "reward_std": 0.11596449092030525, "rewards/ndcg_rule_reward": -0.023349490948021412, "rewards/rule_reward": 0.02734375, "step": 681, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.41345862382540166, "grad_norm": 1.1827118396759033, "kl": 17.25, "learning_rate": 9.206267664155906e-06, "loss": 0.0172, "reward": 0.002388253342360258, "reward_std": 0.08303974196314812, "rewards/ndcg_rule_reward": -0.017142996191978455, "rewards/rule_reward": 0.01953125, "step": 682, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.41406486814186116, "grad_norm": 1.7038532495498657, "kl": 52.75, "learning_rate": 9.203612603454605e-06, "loss": 0.0526, "reward": 0.004036139929667115, "reward_std": 0.12438094988465309, "rewards/ndcg_rule_reward": -0.025260737165808678, "rewards/rule_reward": 0.029296875, "step": 683, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4146711124583207, "grad_norm": 1.5477888584136963, "kl": 28.75, "learning_rate": 9.200953493722807e-06, "loss": 0.0287, "reward": 0.004218956222757697, "reward_std": 0.13268395885825157, "rewards/ndcg_rule_reward": -0.02703104354441166, "rewards/rule_reward": 0.03125, "step": 684, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4152773567747802, "grad_norm": 1.7372217178344727, "kl": 29.4375, "learning_rate": 9.198290337521837e-06, "loss": 0.0295, "reward": 0.0045806600246578455, "reward_std": 0.14093966037034988, "rewards/ndcg_rule_reward": -0.028622465208172798, "rewards/rule_reward": 0.033203125, "step": 685, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.4158836010912398, "grad_norm": 1.106420636177063, "kl": 20.375, "learning_rate": 9.19562313741692e-06, "loss": 0.0204, "reward": 0.002744299592450261, "reward_std": 0.09131673350930214, "rewards/ndcg_rule_reward": -0.018740076571702957, "rewards/rule_reward": 0.021484375, "step": 686, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4164898454076993, "grad_norm": 1.6802960634231567, "kl": 18.375, "learning_rate": 9.192951895977172e-06, "loss": 0.0184, "reward": 0.002502491290215403, "reward_std": 0.0830308310687542, "rewards/ndcg_rule_reward": -0.017028758767992258, "rewards/rule_reward": 0.01953125, "step": 687, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.41709608972415885, "grad_norm": 1.241918683052063, "kl": 25.1875, "learning_rate": 9.1902766157756e-06, "loss": 0.0252, "reward": 0.0028682172996923327, "reward_std": 0.09121894091367722, "rewards/ndcg_rule_reward": -0.01861615665256977, "rewards/rule_reward": 0.021484375, "step": 688, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.9140625, "epoch": 0.41770233404061835, "grad_norm": 1.6804299354553223, "kl": 32.0625, "learning_rate": 9.187597299389108e-06, "loss": 0.0321, "reward": 0.003478261409327388, "reward_std": 0.11618475243449211, "rewards/ndcg_rule_reward": -0.023865489289164543, "rewards/rule_reward": 0.02734375, "step": 689, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4183085783570779, "grad_norm": 1.0118381977081299, "kl": 25.53125, "learning_rate": 9.184913949398485e-06, "loss": 0.0255, "reward": 0.002958796510938555, "reward_std": 0.08275887370109558, "rewards/ndcg_rule_reward": -0.016572454012930393, "rewards/rule_reward": 0.01953125, "step": 690, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4189148226735374, "grad_norm": 2.087291955947876, "kl": 55.0, "learning_rate": 9.1822265683884e-06, "loss": 0.0549, "reward": 0.0036037074751220644, "reward_std": 0.12454477325081825, "rewards/ndcg_rule_reward": -0.025693168863654137, "rewards/rule_reward": 0.029296875, "step": 691, "token_diversity": 0.37109375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.41952106698999697, "grad_norm": 1.4477758407592773, "kl": 39.1875, "learning_rate": 9.179535158947415e-06, "loss": 0.0392, "reward": 0.003955635707825422, "reward_std": 0.12440245971083641, "rewards/ndcg_rule_reward": -0.025341239757835865, "rewards/rule_reward": 0.029296875, "step": 692, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4201273113064565, "grad_norm": 0.980963945388794, "kl": 23.0625, "learning_rate": 9.176839723667962e-06, "loss": 0.023, "reward": 0.0024409047327935696, "reward_std": 0.06619073078036308, "rewards/ndcg_rule_reward": -0.013184095732867718, "rewards/rule_reward": 0.015625, "step": 693, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.42073355562291603, "grad_norm": 2.9093730449676514, "kl": 39.5, "learning_rate": 9.174140265146356e-06, "loss": 0.0396, "reward": 0.0041761992033571005, "reward_std": 0.14112991467118263, "rewards/ndcg_rule_reward": -0.029026925563812256, "rewards/rule_reward": 0.033203125, "step": 694, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4213397999393756, "grad_norm": 2.0198495388031006, "kl": 8.71875, "learning_rate": 9.171436785982789e-06, "loss": 0.0087, "reward": 0.002833267441019416, "reward_std": 0.12487958371639252, "rewards/ndcg_rule_reward": -0.026463608257472515, "rewards/rule_reward": 0.029296875, "step": 695, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4219460442558351, "grad_norm": 3.414214611053467, "kl": 35.625, "learning_rate": 9.168729288781321e-06, "loss": 0.0356, "reward": 0.003750366158783436, "reward_std": 0.12450148910284042, "rewards/ndcg_rule_reward": -0.025546508841216564, "rewards/rule_reward": 0.029296875, "step": 696, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.42255228857229465, "grad_norm": 1.8940186500549316, "kl": 15.34375, "learning_rate": 9.166017776149888e-06, "loss": 0.0153, "reward": 0.0025074623990803957, "reward_std": 0.09982072189450264, "rewards/ndcg_rule_reward": -0.020930037833750248, "rewards/rule_reward": 0.0234375, "step": 697, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.01953125, "epoch": 0.42315853288875416, "grad_norm": 0.9772605299949646, "kl": 23.0625, "learning_rate": 9.163302250700285e-06, "loss": 0.0231, "reward": 0.0026609377237036824, "reward_std": 0.08291793242096901, "rewards/ndcg_rule_reward": -0.016870312858372927, "rewards/rule_reward": 0.01953125, "step": 698, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4237647772052137, "grad_norm": 2.04819393157959, "kl": 26.5, "learning_rate": 9.160582715048183e-06, "loss": 0.0265, "reward": 0.0038147418526932597, "reward_std": 0.11601771041750908, "rewards/ndcg_rule_reward": -0.02352900803089142, "rewards/rule_reward": 0.02734375, "step": 699, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4243710215216732, "grad_norm": 2.1426727771759033, "kl": 30.0625, "learning_rate": 9.157859171813107e-06, "loss": 0.03, "reward": 0.0030588534427806735, "reward_std": 0.10798320546746254, "rewards/ndcg_rule_reward": -0.02233177237212658, "rewards/rule_reward": 0.025390625, "step": 700, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.390625, "epoch": 0.4249772658381328, "grad_norm": 1.3256914615631104, "kl": 34.625, "learning_rate": 9.155131623618447e-06, "loss": 0.0347, "reward": 0.004059021477587521, "reward_std": 0.1327677071094513, "rewards/ndcg_rule_reward": -0.027190979570150375, "rewards/rule_reward": 0.03125, "step": 701, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4255835101545923, "grad_norm": 1.6160110235214233, "kl": 21.375, "learning_rate": 9.15240007309145e-06, "loss": 0.0214, "reward": 0.0031105474336072803, "reward_std": 0.11636850610375404, "rewards/ndcg_rule_reward": -0.024233203381299973, "rewards/rule_reward": 0.02734375, "step": 702, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.42618975447105184, "grad_norm": 2.091913938522339, "kl": 58.0, "learning_rate": 9.149664522863218e-06, "loss": 0.0579, "reward": 0.004742268472909927, "reward_std": 0.14086061716079712, "rewards/ndcg_rule_reward": -0.028460857458412647, "rewards/rule_reward": 0.033203125, "step": 703, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.42679599878751134, "grad_norm": 2.007103681564331, "kl": 38.6875, "learning_rate": 9.1469249755687e-06, "loss": 0.0387, "reward": 0.004065172513946891, "reward_std": 0.14115701615810394, "rewards/ndcg_rule_reward": -0.029137952253222466, "rewards/rule_reward": 0.033203125, "step": 704, "token_diversity": 0.35546875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4274022431039709, "grad_norm": 1.7628594636917114, "kl": 27.8125, "learning_rate": 9.144181433846707e-06, "loss": 0.0278, "reward": 0.0031852375250309706, "reward_std": 0.11630921065807343, "rewards/ndcg_rule_reward": -0.024158512242138386, "rewards/rule_reward": 0.02734375, "step": 705, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4280084874204304, "grad_norm": 1.324718952178955, "kl": 31.6875, "learning_rate": 9.141433900339887e-06, "loss": 0.0317, "reward": 0.003379985224455595, "reward_std": 0.13307218253612518, "rewards/ndcg_rule_reward": -0.027870014309883118, "rewards/rule_reward": 0.03125, "step": 706, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.42861473173688996, "grad_norm": 1.0487873554229736, "kl": 19.40625, "learning_rate": 9.138682377694737e-06, "loss": 0.0194, "reward": 0.0017553814686834812, "reward_std": 0.06650609523057938, "rewards/ndcg_rule_reward": -0.013869618996977806, "rewards/rule_reward": 0.015625, "step": 707, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4292209760533495, "grad_norm": 1.7740782499313354, "kl": 26.12109375, "learning_rate": 9.135926868561597e-06, "loss": 0.0261, "reward": 0.003870238200761378, "reward_std": 0.12443194910883904, "rewards/ndcg_rule_reward": -0.025426635518670082, "rewards/rule_reward": 0.029296875, "step": 708, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 6.201171875, "epoch": 0.429827220369809, "grad_norm": 1.123215913772583, "kl": 19.25, "learning_rate": 9.133167375594647e-06, "loss": 0.0192, "reward": 0.0028635768685489893, "reward_std": 0.09121965244412422, "rewards/ndcg_rule_reward": -0.01862079882994294, "rewards/rule_reward": 0.021484375, "step": 709, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4304334646862686, "grad_norm": 2.5210108757019043, "kl": 31.3125, "learning_rate": 9.1304039014519e-06, "loss": 0.0312, "reward": 0.003031390253454447, "reward_std": 0.0995674841105938, "rewards/ndcg_rule_reward": -0.020406109280884266, "rewards/rule_reward": 0.0234375, "step": 710, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4310397090027281, "grad_norm": 1.5970432758331299, "kl": 27.78125, "learning_rate": 9.127636448795212e-06, "loss": 0.0277, "reward": 0.0027566601638682187, "reward_std": 0.08288629539310932, "rewards/ndcg_rule_reward": -0.01677459036000073, "rewards/rule_reward": 0.01953125, "step": 711, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.43164595331918765, "grad_norm": 1.6626781225204468, "kl": 43.5, "learning_rate": 9.124865020290261e-06, "loss": 0.0435, "reward": 0.00431817548815161, "reward_std": 0.14105293154716492, "rewards/ndcg_rule_reward": -0.028884951025247574, "rewards/rule_reward": 0.033203125, "step": 712, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.041015625, "epoch": 0.43225219763564715, "grad_norm": 1.1670563220977783, "kl": 23.6875, "learning_rate": 9.122089618606563e-06, "loss": 0.0237, "reward": 0.0029224666068330407, "reward_std": 0.10806097835302353, "rewards/ndcg_rule_reward": -0.02246815711259842, "rewards/rule_reward": 0.025390625, "step": 713, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4328584419521067, "grad_norm": 1.490230679512024, "kl": 26.0, "learning_rate": 9.11931024641746e-06, "loss": 0.026, "reward": 0.00288845831528306, "reward_std": 0.09963230788707733, "rewards/ndcg_rule_reward": -0.020549042150378227, "rewards/rule_reward": 0.0234375, "step": 714, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4334646862685662, "grad_norm": 2.003410816192627, "kl": 39.875, "learning_rate": 9.11652690640011e-06, "loss": 0.0399, "reward": 0.0024606107035651803, "reward_std": 0.07461436465382576, "rewards/ndcg_rule_reward": -0.015117513481527567, "rewards/rule_reward": 0.017578125, "step": 715, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.43407093058502577, "grad_norm": 3.6395998001098633, "kl": 34.5625, "learning_rate": 9.113739601235508e-06, "loss": 0.0346, "reward": 0.003336459514684975, "reward_std": 0.09101065993309021, "rewards/ndcg_rule_reward": -0.018147915601730347, "rewards/rule_reward": 0.021484375, "step": 716, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4346771749014853, "grad_norm": 1.4230561256408691, "kl": 22.8125, "learning_rate": 9.110948333608452e-06, "loss": 0.0228, "reward": 0.0027497762348502874, "reward_std": 0.11654643341898918, "rewards/ndcg_rule_reward": -0.024593974463641644, "rewards/rule_reward": 0.02734375, "step": 717, "token_diversity": 0.36328125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.43528341921794483, "grad_norm": 1.7407065629959106, "kl": 15.5703125, "learning_rate": 9.10815310620757e-06, "loss": 0.0156, "reward": 0.0033518457785248756, "reward_std": 0.1162530779838562, "rewards/ndcg_rule_reward": -0.023991904221475124, "rewards/rule_reward": 0.02734375, "step": 718, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4358896635344044, "grad_norm": 1.644106388092041, "kl": 7.4375, "learning_rate": 9.105353921725298e-06, "loss": 0.0075, "reward": 0.00300004193559289, "reward_std": 0.11646288260817528, "rewards/ndcg_rule_reward": -0.024343707598745823, "rewards/rule_reward": 0.02734375, "step": 719, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4364959078508639, "grad_norm": 1.5208827257156372, "kl": 29.875, "learning_rate": 9.102550782857882e-06, "loss": 0.0299, "reward": 0.004102000501006842, "reward_std": 0.12429077178239822, "rewards/ndcg_rule_reward": -0.025194874964654446, "rewards/rule_reward": 0.029296875, "step": 720, "token_diversity": 0.375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.43710215216732345, "grad_norm": 2.4923343658447266, "kl": 26.125, "learning_rate": 9.099743692305379e-06, "loss": 0.0261, "reward": 0.0033987092319875956, "reward_std": 0.1162470206618309, "rewards/ndcg_rule_reward": -0.023945041000843048, "rewards/rule_reward": 0.02734375, "step": 721, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.43770839648378296, "grad_norm": 1.9352055788040161, "kl": 28.125, "learning_rate": 9.096932652771657e-06, "loss": 0.0281, "reward": 0.00419998518191278, "reward_std": 0.14952202886343002, "rewards/ndcg_rule_reward": -0.03095626551657915, "rewards/rule_reward": 0.03515625, "step": 722, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4383146408002425, "grad_norm": 1.5702118873596191, "kl": 39.375, "learning_rate": 9.09411766696438e-06, "loss": 0.0393, "reward": 0.003168194612953812, "reward_std": 0.09952599555253983, "rewards/ndcg_rule_reward": -0.020269305910915136, "rewards/rule_reward": 0.0234375, "step": 723, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.438920885116702, "grad_norm": 1.7058032751083374, "kl": 27.125, "learning_rate": 9.091298737595015e-06, "loss": 0.0271, "reward": 0.002541419002227485, "reward_std": 0.09981102868914604, "rewards/ndcg_rule_reward": -0.020896080881357193, "rewards/rule_reward": 0.0234375, "step": 724, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4395271294331616, "grad_norm": 1.5300837755203247, "kl": 35.5, "learning_rate": 9.088475867378832e-06, "loss": 0.0355, "reward": 0.0026745452196337283, "reward_std": 0.082919891923666, "rewards/ndcg_rule_reward": -0.016856704838573933, "rewards/rule_reward": 0.01953125, "step": 725, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4401333737496211, "grad_norm": 1.5677493810653687, "kl": 40.75, "learning_rate": 9.085649059034894e-06, "loss": 0.0407, "reward": 0.004193562199361622, "reward_std": 0.13268915563821793, "rewards/ndcg_rule_reward": -0.027056436985731125, "rewards/rule_reward": 0.03125, "step": 726, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.44073961806608064, "grad_norm": 1.250524640083313, "kl": 18.59228515625, "learning_rate": 9.082818315286054e-06, "loss": 0.0185, "reward": 0.0027626805240288377, "reward_std": 0.09127955883741379, "rewards/ndcg_rule_reward": -0.018721694126725197, "rewards/rule_reward": 0.021484375, "step": 727, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 8.36328125, "epoch": 0.44134586238254014, "grad_norm": 1.3972965478897095, "kl": 13.59375, "learning_rate": 9.079983638858964e-06, "loss": 0.0136, "reward": 0.0033297305926680565, "reward_std": 0.11627014726400375, "rewards/ndcg_rule_reward": -0.024014019407331944, "rewards/rule_reward": 0.02734375, "step": 728, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 8.154296875, "epoch": 0.4419521066989997, "grad_norm": 1.7527023553848267, "kl": 35.875, "learning_rate": 9.077145032484057e-06, "loss": 0.0358, "reward": 0.004567376570776105, "reward_std": 0.15777453035116196, "rewards/ndcg_rule_reward": -0.03254199959337711, "rewards/rule_reward": 0.037109375, "step": 729, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4425583510154592, "grad_norm": 1.7183482646942139, "kl": 32.1875, "learning_rate": 9.074302498895553e-06, "loss": 0.0322, "reward": 0.0039912075735628605, "reward_std": 0.13278567045927048, "rewards/ndcg_rule_reward": -0.027258792892098427, "rewards/rule_reward": 0.03125, "step": 730, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.44316459533191876, "grad_norm": 2.84438419342041, "kl": 26.65625, "learning_rate": 9.071456040831454e-06, "loss": 0.0266, "reward": 0.003717629471793771, "reward_std": 0.14130454882979393, "rewards/ndcg_rule_reward": -0.029485495761036873, "rewards/rule_reward": 0.033203125, "step": 731, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4437708396483783, "grad_norm": 2.0603694915771484, "kl": 34.5, "learning_rate": 9.068605661033545e-06, "loss": 0.0344, "reward": 0.0041212516371160746, "reward_std": 0.13271605223417282, "rewards/ndcg_rule_reward": -0.02712875045835972, "rewards/rule_reward": 0.03125, "step": 732, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4443770839648378, "grad_norm": 1.6012226343154907, "kl": 42.75, "learning_rate": 9.065751362247389e-06, "loss": 0.0428, "reward": 0.004375543678179383, "reward_std": 0.1578572317957878, "rewards/ndcg_rule_reward": -0.03273383155465126, "rewards/rule_reward": 0.037109375, "step": 733, "token_diversity": 0.37109375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4449833282812974, "grad_norm": 2.673433303833008, "kl": 23.625, "learning_rate": 9.062893147222316e-06, "loss": 0.0236, "reward": 0.0027193600544705987, "reward_std": 0.09972213208675385, "rewards/ndcg_rule_reward": -0.020718140061944723, "rewards/rule_reward": 0.0234375, "step": 734, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4455895725977569, "grad_norm": 1.487099528312683, "kl": 36.25, "learning_rate": 9.06003101871144e-06, "loss": 0.0363, "reward": 0.003908869810402393, "reward_std": 0.12442678213119507, "rewards/ndcg_rule_reward": -0.025388005189597607, "rewards/rule_reward": 0.029296875, "step": 735, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.44619581691421645, "grad_norm": 1.384933590888977, "kl": 47.625, "learning_rate": 9.057164979471636e-06, "loss": 0.0475, "reward": 0.0045903477584943175, "reward_std": 0.13250732421875, "rewards/ndcg_rule_reward": -0.026659652590751648, "rewards/rule_reward": 0.03125, "step": 736, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 6.216796875, "epoch": 0.44680206123067595, "grad_norm": 2.027777671813965, "kl": 45.0, "learning_rate": 9.05429503226355e-06, "loss": 0.045, "reward": 0.002097836579196155, "reward_std": 0.083157479763031, "rewards/ndcg_rule_reward": -0.017433414235711098, "rewards/rule_reward": 0.01953125, "step": 737, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.49609375, "epoch": 0.4474083055471355, "grad_norm": 1.4677320718765259, "kl": 18.78125, "learning_rate": 9.051421179851588e-06, "loss": 0.0188, "reward": 0.002079153899103403, "reward_std": 0.08316093683242798, "rewards/ndcg_rule_reward": -0.01745209563523531, "rewards/rule_reward": 0.01953125, "step": 738, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.448014549863595, "grad_norm": 1.4200832843780518, "kl": 29.0, "learning_rate": 9.048543425003924e-06, "loss": 0.029, "reward": 0.0034298361279070377, "reward_std": 0.11623445153236389, "rewards/ndcg_rule_reward": -0.02391391433775425, "rewards/rule_reward": 0.02734375, "step": 739, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.44862079418005457, "grad_norm": 5.289831638336182, "kl": 33.875, "learning_rate": 9.045661770492484e-06, "loss": 0.0338, "reward": 0.0026143951108679175, "reward_std": 0.0913577750325203, "rewards/ndcg_rule_reward": -0.018869981169700623, "rewards/rule_reward": 0.021484375, "step": 740, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4492270384965141, "grad_norm": 2.6670122146606445, "kl": 27.4375, "learning_rate": 9.042776219092956e-06, "loss": 0.0274, "reward": 0.004173900466412306, "reward_std": 0.12430437281727791, "rewards/ndcg_rule_reward": -0.025122974067926407, "rewards/rule_reward": 0.029296875, "step": 741, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.44983328281297363, "grad_norm": 2.001577615737915, "kl": 23.5, "learning_rate": 9.039886773584779e-06, "loss": 0.0235, "reward": 0.003732911078259349, "reward_std": 0.14132919907569885, "rewards/ndcg_rule_reward": -0.029470212757587433, "rewards/rule_reward": 0.033203125, "step": 742, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.45043952712943314, "grad_norm": 2.203197479248047, "kl": 35.75, "learning_rate": 9.03699343675114e-06, "loss": 0.0358, "reward": 0.0037391046062111855, "reward_std": 0.11607608944177628, "rewards/ndcg_rule_reward": -0.02360464446246624, "rewards/rule_reward": 0.02734375, "step": 743, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4510457714458927, "grad_norm": 1.4712963104248047, "kl": 18.875, "learning_rate": 9.034096211378982e-06, "loss": 0.0188, "reward": 0.003332868218421936, "reward_std": 0.11629555746912956, "rewards/ndcg_rule_reward": -0.024010881781578064, "rewards/rule_reward": 0.02734375, "step": 744, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.45165201576235225, "grad_norm": 2.190857410430908, "kl": 39.9375, "learning_rate": 9.031195100258988e-06, "loss": 0.0399, "reward": 0.003702797694131732, "reward_std": 0.11609034985303879, "rewards/ndcg_rule_reward": -0.023640952073037624, "rewards/rule_reward": 0.02734375, "step": 745, "token_diversity": 0.53125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.45225826007881176, "grad_norm": 1.8595366477966309, "kl": 28.4375, "learning_rate": 9.02829010618558e-06, "loss": 0.0285, "reward": 0.0031043911585584283, "reward_std": 0.11633021757006645, "rewards/ndcg_rule_reward": -0.02423935942351818, "rewards/rule_reward": 0.02734375, "step": 746, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4528645043952713, "grad_norm": 2.176966905593872, "kl": 40.25, "learning_rate": 9.02538123195693e-06, "loss": 0.0402, "reward": 0.0033480223501101136, "reward_std": 0.10783354938030243, "rewards/ndcg_rule_reward": -0.02204260230064392, "rewards/rule_reward": 0.025390625, "step": 747, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4534707487117308, "grad_norm": 3.3970508575439453, "kl": 39.25, "learning_rate": 9.02246848037494e-06, "loss": 0.0393, "reward": 0.004049686482176185, "reward_std": 0.14116759598255157, "rewards/ndcg_rule_reward": -0.02915343828499317, "rewards/rule_reward": 0.033203125, "step": 748, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.02734375, "epoch": 0.4540769930281904, "grad_norm": 2.3678886890411377, "kl": 42.375, "learning_rate": 9.019551854245252e-06, "loss": 0.0424, "reward": 0.004698897944763303, "reward_std": 0.14927027374505997, "rewards/ndcg_rule_reward": -0.030457351356744766, "rewards/rule_reward": 0.03515625, "step": 749, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4546832373446499, "grad_norm": 1.9133403301239014, "kl": 48.75, "learning_rate": 9.016631356377233e-06, "loss": 0.0488, "reward": 0.004140836768783629, "reward_std": 0.1327183060348034, "rewards/ndcg_rule_reward": -0.027109162881970406, "rewards/rule_reward": 0.03125, "step": 750, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.45528948166110944, "grad_norm": 2.163438558578491, "kl": 24.875, "learning_rate": 9.013706989583984e-06, "loss": 0.0249, "reward": 0.0036210615653544664, "reward_std": 0.12456917017698288, "rewards/ndcg_rule_reward": -0.025675813667476177, "rewards/rule_reward": 0.029296875, "step": 751, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.45589572597756894, "grad_norm": 2.005448341369629, "kl": 31.75, "learning_rate": 9.010778756682336e-06, "loss": 0.0317, "reward": 0.0034548206022009254, "reward_std": 0.12461579963564873, "rewards/ndcg_rule_reward": -0.02584205474704504, "rewards/rule_reward": 0.029296875, "step": 752, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.171875, "epoch": 0.4565019702940285, "grad_norm": 1.4920872449874878, "kl": 52.3125, "learning_rate": 9.007846660492836e-06, "loss": 0.0522, "reward": 0.004242757568135858, "reward_std": 0.12425530329346657, "rewards/ndcg_rule_reward": -0.025054117664694786, "rewards/rule_reward": 0.029296875, "step": 753, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.457108214610488, "grad_norm": 2.1145527362823486, "kl": 43.0, "learning_rate": 9.00491070383976e-06, "loss": 0.043, "reward": 0.004040254396386445, "reward_std": 0.1159508153796196, "rewards/ndcg_rule_reward": -0.023303495720028877, "rewards/rule_reward": 0.02734375, "step": 754, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.45771445892694756, "grad_norm": 1.4745826721191406, "kl": 42.375, "learning_rate": 9.001970889551097e-06, "loss": 0.0423, "reward": 0.003448877250775695, "reward_std": 0.11619888246059418, "rewards/ndcg_rule_reward": -0.02389487251639366, "rewards/rule_reward": 0.02734375, "step": 755, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.052734375, "epoch": 0.45832070324340707, "grad_norm": 2.367595672607422, "kl": 53.0, "learning_rate": 8.999027220458551e-06, "loss": 0.0529, "reward": 0.004348635906353593, "reward_std": 0.14104287326335907, "rewards/ndcg_rule_reward": -0.0288544874638319, "rewards/rule_reward": 0.033203125, "step": 756, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4589269475598666, "grad_norm": 1.449830412864685, "kl": 22.25, "learning_rate": 8.996079699397547e-06, "loss": 0.0222, "reward": 0.0036308802664279938, "reward_std": 0.10771195217967033, "rewards/ndcg_rule_reward": -0.021759744733572006, "rewards/rule_reward": 0.025390625, "step": 757, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4595331918763262, "grad_norm": 1.4072515964508057, "kl": 18.7265625, "learning_rate": 8.993128329207212e-06, "loss": 0.0187, "reward": 0.002781143761239946, "reward_std": 0.09127605706453323, "rewards/ndcg_rule_reward": -0.018703231122344732, "rewards/rule_reward": 0.021484375, "step": 758, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4601394361927857, "grad_norm": 1.5833266973495483, "kl": 29.703125, "learning_rate": 8.990173112730384e-06, "loss": 0.0296, "reward": 0.004000580869615078, "reward_std": 0.13278228417038918, "rewards/ndcg_rule_reward": -0.027249419130384922, "rewards/rule_reward": 0.03125, "step": 759, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.46074568050924525, "grad_norm": 1.7236926555633545, "kl": 32.0, "learning_rate": 8.987214052813605e-06, "loss": 0.032, "reward": 0.004422687226906419, "reward_std": 0.14938491582870483, "rewards/ndcg_rule_reward": -0.030733563005924225, "rewards/rule_reward": 0.03515625, "step": 760, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.46135192482570475, "grad_norm": 1.1455069780349731, "kl": 19.6875, "learning_rate": 8.98425115230712e-06, "loss": 0.0197, "reward": 0.002734876819886267, "reward_std": 0.09129004180431366, "rewards/ndcg_rule_reward": -0.018749498762190342, "rewards/rule_reward": 0.021484375, "step": 761, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4619581691421643, "grad_norm": 1.7706925868988037, "kl": 30.625, "learning_rate": 8.981284414064873e-06, "loss": 0.0306, "reward": 0.0028822356835007668, "reward_std": 0.0996479019522667, "rewards/ndcg_rule_reward": -0.020555264316499233, "rewards/rule_reward": 0.0234375, "step": 762, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4625644134586238, "grad_norm": 9.56135082244873, "kl": 28.375, "learning_rate": 8.978313840944504e-06, "loss": 0.0284, "reward": 0.002337240439374, "reward_std": 0.07465327903628349, "rewards/ndcg_rule_reward": -0.015240884851664305, "rewards/rule_reward": 0.017578125, "step": 763, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.46317065777508337, "grad_norm": 1.8797335624694824, "kl": 67.75, "learning_rate": 8.975339435807347e-06, "loss": 0.0677, "reward": 0.0034725740551948547, "reward_std": 0.09936634451150894, "rewards/ndcg_rule_reward": -0.01996492687612772, "rewards/rule_reward": 0.0234375, "step": 764, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4637769020915429, "grad_norm": 3.024855613708496, "kl": 38.40625, "learning_rate": 8.972361201518429e-06, "loss": 0.0384, "reward": 0.004022290464490652, "reward_std": 0.12434930726885796, "rewards/ndcg_rule_reward": -0.025274585001170635, "rewards/rule_reward": 0.029296875, "step": 765, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.46438314640800243, "grad_norm": 1.4258016347885132, "kl": 38.8125, "learning_rate": 8.969379140946464e-06, "loss": 0.0388, "reward": 0.0036023298744112253, "reward_std": 0.09931974112987518, "rewards/ndcg_rule_reward": -0.01983516989275813, "rewards/rule_reward": 0.0234375, "step": 766, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.46498939072446194, "grad_norm": 2.6571857929229736, "kl": 37.125, "learning_rate": 8.966393256963848e-06, "loss": 0.0372, "reward": 0.0032043871469795704, "reward_std": 0.09108191728591919, "rewards/ndcg_rule_reward": -0.018279988318681717, "rewards/rule_reward": 0.021484375, "step": 767, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4655956350409215, "grad_norm": 1.5628764629364014, "kl": 23.3125, "learning_rate": 8.963403552446667e-06, "loss": 0.0233, "reward": 0.0023715642746537924, "reward_std": 0.0914633497595787, "rewards/ndcg_rule_reward": -0.019112810492515564, "rewards/rule_reward": 0.021484375, "step": 768, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.466201879357381, "grad_norm": 1.9733772277832031, "kl": 40.75, "learning_rate": 8.960410030274681e-06, "loss": 0.0408, "reward": 0.004491918021813035, "reward_std": 0.1409727856516838, "rewards/ndcg_rule_reward": -0.02871120721101761, "rewards/rule_reward": 0.033203125, "step": 769, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.46680812367384056, "grad_norm": 1.1943448781967163, "kl": 17.375, "learning_rate": 8.95741269333133e-06, "loss": 0.0174, "reward": 0.0019534691236913204, "reward_std": 0.07483820244669914, "rewards/ndcg_rule_reward": -0.01562465587630868, "rewards/rule_reward": 0.017578125, "step": 770, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0078125, "epoch": 0.4674143679903001, "grad_norm": 1.094350814819336, "kl": 19.59375, "learning_rate": 8.95441154450373e-06, "loss": 0.0196, "reward": 0.0023027234128676355, "reward_std": 0.0746612511575222, "rewards/ndcg_rule_reward": -0.01527540129609406, "rewards/rule_reward": 0.017578125, "step": 771, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4680206123067596, "grad_norm": 1.4484537839889526, "kl": 29.8125, "learning_rate": 8.951406586682663e-06, "loss": 0.0298, "reward": 0.00365480687469244, "reward_std": 0.1161014623939991, "rewards/ndcg_rule_reward": -0.02368894312530756, "rewards/rule_reward": 0.02734375, "step": 772, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4686268566232192, "grad_norm": 1.1939823627471924, "kl": 21.875, "learning_rate": 8.948397822762584e-06, "loss": 0.0219, "reward": 0.0029969390016049147, "reward_std": 0.09957703948020935, "rewards/ndcg_rule_reward": -0.020440561696887016, "rewards/rule_reward": 0.0234375, "step": 773, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4692331009396787, "grad_norm": 2.5039265155792236, "kl": 50.75, "learning_rate": 8.945385255641618e-06, "loss": 0.0507, "reward": 0.0028370990767143667, "reward_std": 0.0996549166738987, "rewards/ndcg_rule_reward": -0.020600400865077972, "rewards/rule_reward": 0.0234375, "step": 774, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.46983934525613824, "grad_norm": 3.010674476623535, "kl": 40.5, "learning_rate": 8.942368888221544e-06, "loss": 0.0404, "reward": 0.0044298984576016665, "reward_std": 0.14099132269620895, "rewards/ndcg_rule_reward": -0.028773225843906403, "rewards/rule_reward": 0.033203125, "step": 775, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.47044558957259774, "grad_norm": 1.780813217163086, "kl": 28.375, "learning_rate": 8.93934872340781e-06, "loss": 0.0283, "reward": 0.00390697515103966, "reward_std": 0.12438610196113586, "rewards/ndcg_rule_reward": -0.02538990043103695, "rewards/rule_reward": 0.029296875, "step": 776, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4710518338890573, "grad_norm": 1.4741390943527222, "kl": 23.1875, "learning_rate": 8.936324764109518e-06, "loss": 0.0232, "reward": 0.0022138518397696316, "reward_std": 0.08312643598765135, "rewards/ndcg_rule_reward": -0.017317397869192064, "rewards/rule_reward": 0.01953125, "step": 777, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4716580782055168, "grad_norm": 1.5232653617858887, "kl": 32.25, "learning_rate": 8.933297013239423e-06, "loss": 0.0322, "reward": 0.0036986201303079724, "reward_std": 0.1076866164803505, "rewards/ndcg_rule_reward": -0.02169200498610735, "rewards/rule_reward": 0.025390625, "step": 778, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.47226432252197637, "grad_norm": 3.0533969402313232, "kl": 22.375, "learning_rate": 8.930265473713939e-06, "loss": 0.0223, "reward": 0.0032517433864995837, "reward_std": 0.1078924722969532, "rewards/ndcg_rule_reward": -0.02213888242840767, "rewards/rule_reward": 0.025390625, "step": 779, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.47287056683843587, "grad_norm": 1.3234635591506958, "kl": 15.1875, "learning_rate": 8.927230148453119e-06, "loss": 0.0152, "reward": 0.00267357868142426, "reward_std": 0.0913429856300354, "rewards/ndcg_rule_reward": -0.018810796551406384, "rewards/rule_reward": 0.021484375, "step": 780, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4734768111548954, "grad_norm": 1.8943406343460083, "kl": 19.75, "learning_rate": 8.924191040380672e-06, "loss": 0.0198, "reward": 0.0033441719133406878, "reward_std": 0.11622582003474236, "rewards/ndcg_rule_reward": -0.02399957925081253, "rewards/rule_reward": 0.02734375, "step": 781, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.47408305547135493, "grad_norm": 1.3828343152999878, "kl": 56.875, "learning_rate": 8.921148152423946e-06, "loss": 0.0568, "reward": 0.005844099447131157, "reward_std": 0.16557950526475906, "rewards/ndcg_rule_reward": -0.03321839962154627, "rewards/rule_reward": 0.0390625, "step": 782, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4746892997878145, "grad_norm": 1.2457480430603027, "kl": 22.0625, "learning_rate": 8.918101487513932e-06, "loss": 0.0221, "reward": 0.0024907045299187303, "reward_std": 0.09140165150165558, "rewards/ndcg_rule_reward": -0.01899367105215788, "rewards/rule_reward": 0.021484375, "step": 783, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.47529554410427405, "grad_norm": 1.4298442602157593, "kl": 36.9375, "learning_rate": 8.915051048585257e-06, "loss": 0.0369, "reward": 0.0032236905535683036, "reward_std": 0.09947337582707405, "rewards/ndcg_rule_reward": -0.020213808864355087, "rewards/rule_reward": 0.0234375, "step": 784, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.47590178842073355, "grad_norm": 1.9178657531738281, "kl": 50.0625, "learning_rate": 8.911996838576182e-06, "loss": 0.0501, "reward": 0.004384560044854879, "reward_std": 0.1326056569814682, "rewards/ndcg_rule_reward": -0.026865439489483833, "rewards/rule_reward": 0.03125, "step": 785, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4765080327371931, "grad_norm": 2.1989665031433105, "kl": 20.71875, "learning_rate": 8.908938860428608e-06, "loss": 0.0207, "reward": 0.002524665789678693, "reward_std": 0.07459072768688202, "rewards/ndcg_rule_reward": -0.015053459908813238, "rewards/rule_reward": 0.017578125, "step": 786, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4771142770536526, "grad_norm": 1.2123275995254517, "kl": 17.140625, "learning_rate": 8.905877117088055e-06, "loss": 0.0172, "reward": 0.0020752232521772385, "reward_std": 0.07476979494094849, "rewards/ndcg_rule_reward": -0.015502901747822762, "rewards/rule_reward": 0.017578125, "step": 787, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.021484375, "epoch": 0.47772052137011217, "grad_norm": 1.6867960691452026, "kl": 64.125, "learning_rate": 8.902811611503678e-06, "loss": 0.064, "reward": 0.00533658592030406, "reward_std": 0.17426184564828873, "rewards/ndcg_rule_reward": -0.0356790404766798, "rewards/rule_reward": 0.041015625, "step": 788, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4783267656865717, "grad_norm": 1.798409104347229, "kl": 18.359375, "learning_rate": 8.899742346628253e-06, "loss": 0.0184, "reward": 0.002513694576919079, "reward_std": 0.09981917589902878, "rewards/ndcg_rule_reward": -0.02092380542308092, "rewards/rule_reward": 0.0234375, "step": 789, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.47893301000303123, "grad_norm": 1.384252905845642, "kl": 29.4375, "learning_rate": 8.896669325418172e-06, "loss": 0.0295, "reward": 0.0028884073253721, "reward_std": 0.0996495708823204, "rewards/ndcg_rule_reward": -0.020549092441797256, "rewards/rule_reward": 0.0234375, "step": 790, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.47953925431949074, "grad_norm": 1.3872854709625244, "kl": 42.0, "learning_rate": 8.893592550833456e-06, "loss": 0.042, "reward": 0.0039941740687936544, "reward_std": 0.1243559867143631, "rewards/ndcg_rule_reward": -0.025302699767053127, "rewards/rule_reward": 0.029296875, "step": 791, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4801454986359503, "grad_norm": 1.488957166671753, "kl": 35.625, "learning_rate": 8.890512025837734e-06, "loss": 0.0356, "reward": 0.003012905130162835, "reward_std": 0.10799365490674973, "rewards/ndcg_rule_reward": -0.02237772010266781, "rewards/rule_reward": 0.025390625, "step": 792, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4807517429524098, "grad_norm": 2.1103286743164062, "kl": 29.1875, "learning_rate": 8.887427753398249e-06, "loss": 0.0292, "reward": 0.004327393136918545, "reward_std": 0.14945118874311447, "rewards/ndcg_rule_reward": -0.03082885779440403, "rewards/rule_reward": 0.03515625, "step": 793, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.48135798726886936, "grad_norm": 1.493396520614624, "kl": 32.8125, "learning_rate": 8.88433973648585e-06, "loss": 0.0328, "reward": 0.002546637551859021, "reward_std": 0.09977832436561584, "rewards/ndcg_rule_reward": -0.020890862680971622, "rewards/rule_reward": 0.0234375, "step": 794, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.48196423158532886, "grad_norm": 1.6531258821487427, "kl": 24.25, "learning_rate": 8.881247978074998e-06, "loss": 0.0243, "reward": 0.004118213430047035, "reward_std": 0.13272443413734436, "rewards/ndcg_rule_reward": -0.027131786569952965, "rewards/rule_reward": 0.03125, "step": 795, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4825704759017884, "grad_norm": 1.7333182096481323, "kl": 31.0625, "learning_rate": 8.87815248114376e-06, "loss": 0.0311, "reward": 0.003916444955393672, "reward_std": 0.13281790167093277, "rewards/ndcg_rule_reward": -0.02733355574309826, "rewards/rule_reward": 0.03125, "step": 796, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.483176720218248, "grad_norm": 2.1364479064941406, "kl": 41.75, "learning_rate": 8.875053248673795e-06, "loss": 0.0418, "reward": 0.004443885292857885, "reward_std": 0.13261624425649643, "rewards/ndcg_rule_reward": -0.026806115172803402, "rewards/rule_reward": 0.03125, "step": 797, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4837829645347075, "grad_norm": 2.1800551414489746, "kl": 20.4375, "learning_rate": 8.87195028365037e-06, "loss": 0.0205, "reward": 0.0034344870364293456, "reward_std": 0.11623654514551163, "rewards/ndcg_rule_reward": -0.02390926331281662, "rewards/rule_reward": 0.02734375, "step": 798, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.48438920885116704, "grad_norm": 1.2375208139419556, "kl": 32.25, "learning_rate": 8.86884358906234e-06, "loss": 0.0322, "reward": 0.002521970309317112, "reward_std": 0.08295947685837746, "rewards/ndcg_rule_reward": -0.017009279690682888, "rewards/rule_reward": 0.01953125, "step": 799, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.48499545316762654, "grad_norm": 1.195786714553833, "kl": 47.5, "learning_rate": 8.865733167902155e-06, "loss": 0.0475, "reward": 0.0030346496496349573, "reward_std": 0.09116819873452187, "rewards/ndcg_rule_reward": -0.018449725583195686, "rewards/rule_reward": 0.021484375, "step": 800, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4856016974840861, "grad_norm": 2.5228309631347656, "kl": 33.0625, "learning_rate": 8.862619023165856e-06, "loss": 0.033, "reward": 0.002482459880411625, "reward_std": 0.08300879970192909, "rewards/ndcg_rule_reward": -0.01704879105091095, "rewards/rule_reward": 0.01953125, "step": 801, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.017578125, "epoch": 0.4862079418005456, "grad_norm": 1.4825068712234497, "kl": 40.0625, "learning_rate": 8.859501157853067e-06, "loss": 0.04, "reward": 0.0032045929692685604, "reward_std": 0.10789847746491432, "rewards/ndcg_rule_reward": -0.022186032496392727, "rewards/rule_reward": 0.025390625, "step": 802, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.48681418611700517, "grad_norm": 1.4638495445251465, "kl": 45.125, "learning_rate": 8.856379574967e-06, "loss": 0.0451, "reward": 0.0037988240364938974, "reward_std": 0.11602062731981277, "rewards/ndcg_rule_reward": -0.023544926196336746, "rewards/rule_reward": 0.02734375, "step": 803, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.48742043043346467, "grad_norm": 1.5376973152160645, "kl": 12.8125, "learning_rate": 8.853254277514448e-06, "loss": 0.0128, "reward": 0.0026422993978485465, "reward_std": 0.09135031327605247, "rewards/ndcg_rule_reward": -0.018842075020074844, "rewards/rule_reward": 0.021484375, "step": 804, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4880266747499242, "grad_norm": 1.6639435291290283, "kl": 27.6875, "learning_rate": 8.850125268505775e-06, "loss": 0.0278, "reward": 0.004081588238477707, "reward_std": 0.13273292034864426, "rewards/ndcg_rule_reward": -0.027168411761522293, "rewards/rule_reward": 0.03125, "step": 805, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.48863291906638373, "grad_norm": 2.408979892730713, "kl": 36.125, "learning_rate": 8.846992550954928e-06, "loss": 0.0361, "reward": 0.004452587338164449, "reward_std": 0.12415097281336784, "rewards/ndcg_rule_reward": -0.024844287894666195, "rewards/rule_reward": 0.029296875, "step": 806, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4892391633828433, "grad_norm": 1.81276273727417, "kl": 16.90625, "learning_rate": 8.843856127879425e-06, "loss": 0.0169, "reward": 0.0030584702035412192, "reward_std": 0.09116169065237045, "rewards/ndcg_rule_reward": -0.018425905611366034, "rewards/rule_reward": 0.021484375, "step": 807, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4898454076993028, "grad_norm": 1.172965407371521, "kl": 18.4375, "learning_rate": 8.840716002300347e-06, "loss": 0.0184, "reward": 0.002113403403200209, "reward_std": 0.08314992859959602, "rewards/ndcg_rule_reward": -0.0174178471788764, "rewards/rule_reward": 0.01953125, "step": 808, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.49045165201576235, "grad_norm": 3.2772598266601562, "kl": 76.9375, "learning_rate": 8.837572177242349e-06, "loss": 0.0769, "reward": 0.004076812183484435, "reward_std": 0.14119082689285278, "rewards/ndcg_rule_reward": -0.029126313515007496, "rewards/rule_reward": 0.033203125, "step": 809, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.4910578963322219, "grad_norm": 2.0885918140411377, "kl": 43.5, "learning_rate": 8.834424655733643e-06, "loss": 0.0435, "reward": 0.003739431966096163, "reward_std": 0.13289909064769745, "rewards/ndcg_rule_reward": -0.027510568499565125, "rewards/rule_reward": 0.03125, "step": 810, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4916641406486814, "grad_norm": 1.4742810726165771, "kl": 20.8125, "learning_rate": 8.83127344080601e-06, "loss": 0.0209, "reward": 0.0026598803815431893, "reward_std": 0.09974450618028641, "rewards/ndcg_rule_reward": -0.020777619443833828, "rewards/rule_reward": 0.0234375, "step": 811, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.492270384965141, "grad_norm": 15.25108814239502, "kl": 63.375, "learning_rate": 8.828118535494777e-06, "loss": 0.0634, "reward": 0.0034856265410780907, "reward_std": 0.10775505751371384, "rewards/ndcg_rule_reward": -0.02190499845892191, "rewards/rule_reward": 0.025390625, "step": 812, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.640625, "epoch": 0.4928766292816005, "grad_norm": 1.4135123491287231, "kl": 57.625, "learning_rate": 8.824959942838835e-06, "loss": 0.0577, "reward": 0.00442486722022295, "reward_std": 0.1241643875837326, "rewards/ndcg_rule_reward": -0.02487200777977705, "rewards/rule_reward": 0.029296875, "step": 813, "token_diversity": 0.25 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.49348287359806003, "grad_norm": 1.4005117416381836, "kl": 37.75, "learning_rate": 8.821797665880626e-06, "loss": 0.0379, "reward": 0.003922200063243508, "reward_std": 0.12441715970635414, "rewards/ndcg_rule_reward": -0.025374675169587135, "rewards/rule_reward": 0.029296875, "step": 814, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.49408911791451954, "grad_norm": 1.444445013999939, "kl": 24.96875, "learning_rate": 8.818631707666136e-06, "loss": 0.0249, "reward": 0.0025605368427932262, "reward_std": 0.09137725830078125, "rewards/ndcg_rule_reward": -0.01892383862286806, "rewards/rule_reward": 0.021484375, "step": 815, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4946953622309791, "grad_norm": 1.438676357269287, "kl": 27.1875, "learning_rate": 8.815462071244898e-06, "loss": 0.0272, "reward": 0.0032291484531015158, "reward_std": 0.09950266405940056, "rewards/ndcg_rule_reward": -0.020208352245390415, "rewards/rule_reward": 0.0234375, "step": 816, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4953016065474386, "grad_norm": 1.1092665195465088, "kl": 42.625, "learning_rate": 8.812288759669994e-06, "loss": 0.0426, "reward": 0.0032851401483640075, "reward_std": 0.09102313220500946, "rewards/ndcg_rule_reward": -0.018199236132204533, "rewards/rule_reward": 0.021484375, "step": 817, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.49590785086389816, "grad_norm": 3.2298974990844727, "kl": 27.875, "learning_rate": 8.809111775998035e-06, "loss": 0.0278, "reward": 0.0037308288738131523, "reward_std": 0.1244751587510109, "rewards/ndcg_rule_reward": -0.025566047057509422, "rewards/rule_reward": 0.029296875, "step": 818, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.49651409518035766, "grad_norm": 1.5480544567108154, "kl": 13.84375, "learning_rate": 8.805931123289182e-06, "loss": 0.0139, "reward": 0.0023954714415594935, "reward_std": 0.09142759442329407, "rewards/ndcg_rule_reward": -0.019088903442025185, "rewards/rule_reward": 0.021484375, "step": 819, "token_diversity": 0.3671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4971203394968172, "grad_norm": 1.7768642902374268, "kl": 23.1875, "learning_rate": 8.802746804607119e-06, "loss": 0.0232, "reward": 0.0031185683328658342, "reward_std": 0.12480100989341736, "rewards/ndcg_rule_reward": -0.02617830689996481, "rewards/rule_reward": 0.029296875, "step": 820, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4977265838132767, "grad_norm": 1.7595269680023193, "kl": 25.9375, "learning_rate": 8.799558823019068e-06, "loss": 0.0259, "reward": 0.004373828181996942, "reward_std": 0.1410396695137024, "rewards/ndcg_rule_reward": -0.028829297050833702, "rewards/rule_reward": 0.033203125, "step": 821, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.4983328281297363, "grad_norm": 6.805373668670654, "kl": 46.375, "learning_rate": 8.796367181595775e-06, "loss": 0.0464, "reward": 0.002017658844124526, "reward_std": 0.07482220605015755, "rewards/ndcg_rule_reward": -0.015560466796159744, "rewards/rule_reward": 0.017578125, "step": 822, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.49893907244619584, "grad_norm": 1.792048454284668, "kl": 30.1875, "learning_rate": 8.793171883411515e-06, "loss": 0.0302, "reward": 0.001975711726117879, "reward_std": 0.06639702618122101, "rewards/ndcg_rule_reward": -0.013649288564920425, "rewards/rule_reward": 0.015625, "step": 823, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.49954531676265534, "grad_norm": 3.677690267562866, "kl": 38.875, "learning_rate": 8.789972931544081e-06, "loss": 0.0389, "reward": 0.0019988660351373255, "reward_std": 0.0664227195084095, "rewards/ndcg_rule_reward": -0.013626134023070335, "rewards/rule_reward": 0.015625, "step": 824, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5001515610791148, "grad_norm": 2.5889108180999756, "kl": 34.4375, "learning_rate": 8.786770329074794e-06, "loss": 0.0345, "reward": 0.002322850516065955, "reward_std": 0.07469094544649124, "rewards/ndcg_rule_reward": -0.015255275648087263, "rewards/rule_reward": 0.017578125, "step": 825, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5007578053955745, "grad_norm": 1.9273864030838013, "kl": 21.0625, "learning_rate": 8.783564079088478e-06, "loss": 0.021, "reward": 0.0032243571477010846, "reward_std": 0.10790062323212624, "rewards/ndcg_rule_reward": -0.022166268900036812, "rewards/rule_reward": 0.025390625, "step": 826, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.501364049712034, "grad_norm": 1.549448013305664, "kl": 34.1875, "learning_rate": 8.780354184673478e-06, "loss": 0.0342, "reward": 0.0042365791741758585, "reward_std": 0.13267936557531357, "rewards/ndcg_rule_reward": -0.02701342012733221, "rewards/rule_reward": 0.03125, "step": 827, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5019702940284935, "grad_norm": 1.357468605041504, "kl": 31.5, "learning_rate": 8.777140648921653e-06, "loss": 0.0315, "reward": 0.0039207651279866695, "reward_std": 0.11598413065075874, "rewards/ndcg_rule_reward": -0.023422985337674618, "rewards/rule_reward": 0.02734375, "step": 828, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.502576538344953, "grad_norm": 1.3894011974334717, "kl": 22.21875, "learning_rate": 8.773923474928365e-06, "loss": 0.0222, "reward": 0.0024312856839969754, "reward_std": 0.07459114864468575, "rewards/ndcg_rule_reward": -0.015146840363740921, "rewards/rule_reward": 0.017578125, "step": 829, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5031827826614126, "grad_norm": 2.7392914295196533, "kl": 22.25, "learning_rate": 8.770702665792478e-06, "loss": 0.0222, "reward": 0.0028973007574677467, "reward_std": 0.09963930770754814, "rewards/ndcg_rule_reward": -0.020540200173854828, "rewards/rule_reward": 0.0234375, "step": 830, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 7.162109375, "epoch": 0.5037890269778721, "grad_norm": 1.7804477214813232, "kl": 28.3125, "learning_rate": 8.767478224616363e-06, "loss": 0.0283, "reward": 0.004043979570269585, "reward_std": 0.13280104845762253, "rewards/ndcg_rule_reward": -0.027206020895391703, "rewards/rule_reward": 0.03125, "step": 831, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.5043952712943316, "grad_norm": 1.7612426280975342, "kl": 34.5, "learning_rate": 8.764250154505884e-06, "loss": 0.0345, "reward": 0.003522329614497721, "reward_std": 0.14143703877925873, "rewards/ndcg_rule_reward": -0.02968079410493374, "rewards/rule_reward": 0.033203125, "step": 832, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5050015156107911, "grad_norm": 1.5237904787063599, "kl": 26.5625, "learning_rate": 8.761018458570406e-06, "loss": 0.0266, "reward": 0.0028504872461780906, "reward_std": 0.11647459864616394, "rewards/ndcg_rule_reward": -0.024493263103067875, "rewards/rule_reward": 0.02734375, "step": 833, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5056077599272507, "grad_norm": 1.4624730348587036, "kl": 27.1201171875, "learning_rate": 8.75778313992278e-06, "loss": 0.0272, "reward": 0.004054821794852614, "reward_std": 0.14119026809930801, "rewards/ndcg_rule_reward": -0.029148302972316742, "rewards/rule_reward": 0.033203125, "step": 834, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5062140042437102, "grad_norm": 1.5047866106033325, "kl": 44.125, "learning_rate": 8.754544201679354e-06, "loss": 0.044, "reward": 0.0034897675504907966, "reward_std": 0.11618191748857498, "rewards/ndcg_rule_reward": -0.02385398279875517, "rewards/rule_reward": 0.02734375, "step": 835, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5068202485601697, "grad_norm": 1.4735636711120605, "kl": 40.0625, "learning_rate": 8.751301646959958e-06, "loss": 0.0401, "reward": 0.003862190991640091, "reward_std": 0.12443181872367859, "rewards/ndcg_rule_reward": -0.025434683542698622, "rewards/rule_reward": 0.029296875, "step": 836, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5074264928766293, "grad_norm": 1.4083030223846436, "kl": 47.0, "learning_rate": 8.748055478887905e-06, "loss": 0.047, "reward": 0.003537084790877998, "reward_std": 0.10774007439613342, "rewards/ndcg_rule_reward": -0.02185354009270668, "rewards/rule_reward": 0.025390625, "step": 837, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0859375, "epoch": 0.5080327371930888, "grad_norm": 2.5309205055236816, "kl": 29.125, "learning_rate": 8.744805700589989e-06, "loss": 0.0291, "reward": 0.002706354367546737, "reward_std": 0.09132668003439903, "rewards/ndcg_rule_reward": -0.018778021447360516, "rewards/rule_reward": 0.021484375, "step": 838, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 6.201171875, "epoch": 0.5086389815095483, "grad_norm": 1.1085189580917358, "kl": 32.8125, "learning_rate": 8.741552315196484e-06, "loss": 0.0328, "reward": 0.003159177373163402, "reward_std": 0.09949054196476936, "rewards/ndcg_rule_reward": -0.020278322510421276, "rewards/rule_reward": 0.0234375, "step": 839, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5092452258260078, "grad_norm": 2.1386313438415527, "kl": 72.5, "learning_rate": 8.738295325841136e-06, "loss": 0.0723, "reward": 0.004129479872062802, "reward_std": 0.12431911379098892, "rewards/ndcg_rule_reward": -0.02516739536076784, "rewards/rule_reward": 0.029296875, "step": 840, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5098514701424675, "grad_norm": 4.2106828689575195, "kl": 57.0, "learning_rate": 8.735034735661162e-06, "loss": 0.057, "reward": 0.0036953294184058905, "reward_std": 0.11608575284481049, "rewards/ndcg_rule_reward": -0.02364841988310218, "rewards/rule_reward": 0.02734375, "step": 841, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 6.681640625, "epoch": 0.510457714458927, "grad_norm": 1.41499924659729, "kl": 36.75, "learning_rate": 8.731770547797252e-06, "loss": 0.0367, "reward": 0.0036631253315135837, "reward_std": 0.13295059651136398, "rewards/ndcg_rule_reward": -0.02758687548339367, "rewards/rule_reward": 0.03125, "step": 842, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0078125, "epoch": 0.5110639587753865, "grad_norm": 1.43031907081604, "kl": 24.8125, "learning_rate": 8.728502765393554e-06, "loss": 0.0248, "reward": 0.0029768507229164243, "reward_std": 0.1080179437994957, "rewards/ndcg_rule_reward": -0.022413773462176323, "rewards/rule_reward": 0.025390625, "step": 843, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.07421875, "epoch": 0.511670203091846, "grad_norm": 4.697606563568115, "kl": 30.78125, "learning_rate": 8.72523139159768e-06, "loss": 0.0308, "reward": 0.0027963934116996825, "reward_std": 0.09970634803175926, "rewards/ndcg_rule_reward": -0.02064110664650798, "rewards/rule_reward": 0.0234375, "step": 844, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5122764474083056, "grad_norm": 2.125917673110962, "kl": 19.25, "learning_rate": 8.721956429560711e-06, "loss": 0.0192, "reward": 0.0030522916931658983, "reward_std": 0.09958575665950775, "rewards/ndcg_rule_reward": -0.020385209005326033, "rewards/rule_reward": 0.0234375, "step": 845, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5128826917247651, "grad_norm": 0.806122899055481, "kl": 27.1875, "learning_rate": 8.718677882437172e-06, "loss": 0.0272, "reward": 0.0025371951051056385, "reward_std": 0.0745576024055481, "rewards/ndcg_rule_reward": -0.015040929894894361, "rewards/rule_reward": 0.017578125, "step": 846, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5134889360412246, "grad_norm": 1.996954321861267, "kl": 35.1875, "learning_rate": 8.715395753385048e-06, "loss": 0.0351, "reward": 0.004252597922459245, "reward_std": 0.13264526426792145, "rewards/ndcg_rule_reward": -0.0269974023103714, "rewards/rule_reward": 0.03125, "step": 847, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5140951803576842, "grad_norm": 1.6508082151412964, "kl": 26.8125, "learning_rate": 8.712110045565768e-06, "loss": 0.0269, "reward": 0.003724991111084819, "reward_std": 0.12450532242655754, "rewards/ndcg_rule_reward": -0.025571884587407112, "rewards/rule_reward": 0.029296875, "step": 848, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5147014246741437, "grad_norm": 2.9772489070892334, "kl": 30.09375, "learning_rate": 8.708820762144217e-06, "loss": 0.0301, "reward": 0.003724142792634666, "reward_std": 0.14130057394504547, "rewards/ndcg_rule_reward": -0.029478982090950012, "rewards/rule_reward": 0.033203125, "step": 849, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5153076689906032, "grad_norm": 1.8396986722946167, "kl": 30.625, "learning_rate": 8.705527906288718e-06, "loss": 0.0306, "reward": 0.0029195542447268963, "reward_std": 0.10806262493133545, "rewards/ndcg_rule_reward": -0.022471072152256966, "rewards/rule_reward": 0.025390625, "step": 850, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5159139133070627, "grad_norm": 1.315307378768921, "kl": 26.90625, "learning_rate": 8.702231481171036e-06, "loss": 0.0269, "reward": 0.0025659791426733136, "reward_std": 0.09981205314397812, "rewards/ndcg_rule_reward": -0.020871521905064583, "rewards/rule_reward": 0.0234375, "step": 851, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5165201576235223, "grad_norm": 1.7524386644363403, "kl": 19.1875, "learning_rate": 8.698931489966373e-06, "loss": 0.0191, "reward": 0.002499878639355302, "reward_std": 0.09139997884631157, "rewards/ndcg_rule_reward": -0.018984496127814054, "rewards/rule_reward": 0.021484375, "step": 852, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5171264019399818, "grad_norm": 2.1259703636169434, "kl": 35.875, "learning_rate": 8.695627935853372e-06, "loss": 0.0358, "reward": 0.0030364124104380608, "reward_std": 0.10795709863305092, "rewards/ndcg_rule_reward": -0.022354213520884514, "rewards/rule_reward": 0.025390625, "step": 853, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5177326462564413, "grad_norm": 1.6688652038574219, "kl": 20.25, "learning_rate": 8.692320822014099e-06, "loss": 0.0203, "reward": 0.0017608001362532377, "reward_std": 0.06649668514728546, "rewards/ndcg_rule_reward": -0.013864199630916119, "rewards/rule_reward": 0.015625, "step": 854, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5183388905729008, "grad_norm": 1.225615382194519, "kl": 18.59375, "learning_rate": 8.689010151634058e-06, "loss": 0.0186, "reward": 0.0029410759452730417, "reward_std": 0.09962964057922363, "rewards/ndcg_rule_reward": -0.020496423356235027, "rewards/rule_reward": 0.0234375, "step": 855, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5189451348893604, "grad_norm": 2.4142863750457764, "kl": 39.3125, "learning_rate": 8.68569592790217e-06, "loss": 0.0394, "reward": 0.003733514924533665, "reward_std": 0.1244717501103878, "rewards/ndcg_rule_reward": -0.025563360191881657, "rewards/rule_reward": 0.029296875, "step": 856, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.51955137920582, "grad_norm": 1.416590929031372, "kl": 38.1875, "learning_rate": 8.682378154010785e-06, "loss": 0.0382, "reward": 0.004615939222276211, "reward_std": 0.12407238036394119, "rewards/ndcg_rule_reward": -0.02468093577772379, "rewards/rule_reward": 0.029296875, "step": 857, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5201576235222795, "grad_norm": 1.4344450235366821, "kl": 40.5, "learning_rate": 8.679056833155667e-06, "loss": 0.0405, "reward": 0.0032096265349537134, "reward_std": 0.11634047701954842, "rewards/ndcg_rule_reward": -0.02413412369787693, "rewards/rule_reward": 0.02734375, "step": 858, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.520763867838739, "grad_norm": 1.2302109003067017, "kl": 13.28125, "learning_rate": 8.675731968536004e-06, "loss": 0.0133, "reward": 0.003950607730075717, "reward_std": 0.11597645655274391, "rewards/ndcg_rule_reward": -0.023393141105771065, "rewards/rule_reward": 0.02734375, "step": 859, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 6.921875, "epoch": 0.5213701121551986, "grad_norm": 1.891748309135437, "kl": 39.34375, "learning_rate": 8.67240356335439e-06, "loss": 0.0394, "reward": 0.003629771526902914, "reward_std": 0.12456704676151276, "rewards/ndcg_rule_reward": -0.025667103938758373, "rewards/rule_reward": 0.029296875, "step": 860, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5219763564716581, "grad_norm": 1.4316284656524658, "kl": 34.9375, "learning_rate": 8.669071620816834e-06, "loss": 0.0349, "reward": 0.0024362874682992697, "reward_std": 0.0745907798409462, "rewards/ndcg_rule_reward": -0.015141838230192661, "rewards/rule_reward": 0.017578125, "step": 861, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5225826007881176, "grad_norm": 1.8459501266479492, "kl": 22.4375, "learning_rate": 8.66573614413275e-06, "loss": 0.0224, "reward": 0.003111277474090457, "reward_std": 0.10796783491969109, "rewards/ndcg_rule_reward": -0.022279348224401474, "rewards/rule_reward": 0.025390625, "step": 862, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5231888451045772, "grad_norm": 2.2616076469421387, "kl": 32.5, "learning_rate": 8.662397136514959e-06, "loss": 0.0325, "reward": 0.003795394441112876, "reward_std": 0.13291903212666512, "rewards/ndcg_rule_reward": -0.027454604394733906, "rewards/rule_reward": 0.03125, "step": 863, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5237950894210367, "grad_norm": 2.1781129837036133, "kl": 43.8125, "learning_rate": 8.65905460117968e-06, "loss": 0.0438, "reward": 0.0038664196617901325, "reward_std": 0.12446432188153267, "rewards/ndcg_rule_reward": -0.025430455803871155, "rewards/rule_reward": 0.029296875, "step": 864, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5244013337374962, "grad_norm": 1.9545068740844727, "kl": 30.03125, "learning_rate": 8.655708541346532e-06, "loss": 0.03, "reward": 0.0032234329264611006, "reward_std": 0.09950394183397293, "rewards/ndcg_rule_reward": -0.020214068237692118, "rewards/rule_reward": 0.0234375, "step": 865, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5250075780539557, "grad_norm": 7.401889801025391, "kl": 33.1875, "learning_rate": 8.652358960238527e-06, "loss": 0.0332, "reward": 0.0035272528184577823, "reward_std": 0.11617937311530113, "rewards/ndcg_rule_reward": -0.023816497065126896, "rewards/rule_reward": 0.02734375, "step": 866, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5256138223704153, "grad_norm": 1.582249402999878, "kl": 22.3125, "learning_rate": 8.649005861082071e-06, "loss": 0.0223, "reward": 0.0038212923100218177, "reward_std": 0.11605127155780792, "rewards/ndcg_rule_reward": -0.023522458039224148, "rewards/rule_reward": 0.02734375, "step": 867, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5262200666868748, "grad_norm": 1.2647182941436768, "kl": 21.875, "learning_rate": 8.645649247106956e-06, "loss": 0.0219, "reward": 0.0028495790902525187, "reward_std": 0.09965132176876068, "rewards/ndcg_rule_reward": -0.020587921142578125, "rewards/rule_reward": 0.0234375, "step": 868, "token_diversity": 0.37109375 }, { "categorical_diversity": 1.0, "completion_length": 6.921875, "epoch": 0.5268263110033343, "grad_norm": 1.586734652519226, "kl": 21.6875, "learning_rate": 8.64228912154636e-06, "loss": 0.0217, "reward": 0.0022118614288046956, "reward_std": 0.08312150090932846, "rewards/ndcg_rule_reward": -0.017319388687610626, "rewards/rule_reward": 0.01953125, "step": 869, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5274325553197938, "grad_norm": 1.1242060661315918, "kl": 27.125, "learning_rate": 8.638925487636847e-06, "loss": 0.0271, "reward": 0.0031115029705688357, "reward_std": 0.09954112395644188, "rewards/ndcg_rule_reward": -0.020325996913015842, "rewards/rule_reward": 0.0234375, "step": 870, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5280387996362534, "grad_norm": 3.3629462718963623, "kl": 29.1875, "learning_rate": 8.635558348618359e-06, "loss": 0.0291, "reward": 0.0030463580042123795, "reward_std": 0.1332457736134529, "rewards/ndcg_rule_reward": -0.028203641064465046, "rewards/rule_reward": 0.03125, "step": 871, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.528645043952713, "grad_norm": 2.382452964782715, "kl": 26.0625, "learning_rate": 8.63218770773421e-06, "loss": 0.026, "reward": 0.0038737431168556213, "reward_std": 0.14126275479793549, "rewards/ndcg_rule_reward": -0.02932938188314438, "rewards/rule_reward": 0.033203125, "step": 872, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5292512882691724, "grad_norm": 1.5585598945617676, "kl": 19.875, "learning_rate": 8.628813568231092e-06, "loss": 0.0199, "reward": 0.0033531549852341413, "reward_std": 0.11625805497169495, "rewards/ndcg_rule_reward": -0.02399059385061264, "rewards/rule_reward": 0.02734375, "step": 873, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5298575325856321, "grad_norm": 1.636635422706604, "kl": 45.75, "learning_rate": 8.625435933359063e-06, "loss": 0.0457, "reward": 0.004471228690817952, "reward_std": 0.14097342640161514, "rewards/ndcg_rule_reward": -0.028731897473335266, "rewards/rule_reward": 0.033203125, "step": 874, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5304637769020916, "grad_norm": 1.5205906629562378, "kl": 40.4375, "learning_rate": 8.622054806371553e-06, "loss": 0.0404, "reward": 0.0033829277381300926, "reward_std": 0.10782491788268089, "rewards/ndcg_rule_reward": -0.022007698193192482, "rewards/rule_reward": 0.025390625, "step": 875, "token_diversity": 0.3828125 }, { "categorical_diversity": 1.0, "completion_length": 5.470703125, "epoch": 0.5310700212185511, "grad_norm": 2.137173891067505, "kl": 32.375, "learning_rate": 8.61867019052535e-06, "loss": 0.0324, "reward": 0.003031126456335187, "reward_std": 0.11642274260520935, "rewards/ndcg_rule_reward": -0.024312623776495457, "rewards/rule_reward": 0.02734375, "step": 876, "token_diversity": 0.375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5316762655350106, "grad_norm": 1.7142468690872192, "kl": 52.625, "learning_rate": 8.61528208908061e-06, "loss": 0.0526, "reward": 0.003881171578541398, "reward_std": 0.13282670825719833, "rewards/ndcg_rule_reward": -0.02736882958561182, "rewards/rule_reward": 0.03125, "step": 877, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5322825098514702, "grad_norm": 2.0747928619384766, "kl": 41.0, "learning_rate": 8.611890505300836e-06, "loss": 0.041, "reward": 0.003153227095026523, "reward_std": 0.09952275454998016, "rewards/ndcg_rule_reward": -0.02028427366167307, "rewards/rule_reward": 0.0234375, "step": 878, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.5328887541679297, "grad_norm": 1.3786859512329102, "kl": 49.875, "learning_rate": 8.608495442452892e-06, "loss": 0.0499, "reward": 0.004443885292857885, "reward_std": 0.12418331950902939, "rewards/ndcg_rule_reward": -0.024852990172803402, "rewards/rule_reward": 0.029296875, "step": 879, "token_diversity": 0.26222047018348627 }, { "categorical_diversity": 1.0, "completion_length": 7.642578125, "epoch": 0.5334949984843892, "grad_norm": 1.4170057773590088, "kl": 22.5, "learning_rate": 8.605096903806991e-06, "loss": 0.0225, "reward": 0.0020084448624402285, "reward_std": 0.05796088092029095, "rewards/ndcg_rule_reward": -0.011663430137559772, "rewards/rule_reward": 0.013671875, "step": 880, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5341012428008487, "grad_norm": 1.315836787223816, "kl": 33.1875, "learning_rate": 8.601694892636701e-06, "loss": 0.0332, "reward": 0.0031279895920306444, "reward_std": 0.0911063551902771, "rewards/ndcg_rule_reward": -0.0183563856408, "rewards/rule_reward": 0.021484375, "step": 881, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5347074871173083, "grad_norm": 1.567873477935791, "kl": 40.5, "learning_rate": 8.598289412218923e-06, "loss": 0.0405, "reward": 0.0043457894353196025, "reward_std": 0.13264350593090057, "rewards/ndcg_rule_reward": -0.026904210448265076, "rewards/rule_reward": 0.03125, "step": 882, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5353137314337678, "grad_norm": 1.5204740762710571, "kl": 21.6875, "learning_rate": 8.594880465833908e-06, "loss": 0.0217, "reward": 0.003086360404267907, "reward_std": 0.09955230727791786, "rewards/ndcg_rule_reward": -0.020351139828562737, "rewards/rule_reward": 0.0234375, "step": 883, "token_diversity": 0.3828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5359199757502273, "grad_norm": 2.7871577739715576, "kl": 43.875, "learning_rate": 8.591468056765243e-06, "loss": 0.0438, "reward": 0.003998373867943883, "reward_std": 0.10753435641527176, "rewards/ndcg_rule_reward": -0.021392252296209335, "rewards/rule_reward": 0.025390625, "step": 884, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5365262200666868, "grad_norm": 1.5639656782150269, "kl": 29.5, "learning_rate": 8.58805218829985e-06, "loss": 0.0295, "reward": 0.004615938989445567, "reward_std": 0.12407238408923149, "rewards/ndcg_rule_reward": -0.02468093577772379, "rewards/rule_reward": 0.029296875, "step": 885, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5371324643831464, "grad_norm": 1.6117914915084839, "kl": 49.0625, "learning_rate": 8.584632863727982e-06, "loss": 0.0489, "reward": 0.0023113699862733483, "reward_std": 0.08305826410651207, "rewards/ndcg_rule_reward": -0.017219880130141973, "rewards/rule_reward": 0.01953125, "step": 886, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5377387086996059, "grad_norm": 1.1953808069229126, "kl": 29.5, "learning_rate": 8.581210086343226e-06, "loss": 0.0295, "reward": 0.004453535657376051, "reward_std": 0.13258694112300873, "rewards/ndcg_rule_reward": -0.02679646573960781, "rewards/rule_reward": 0.03125, "step": 887, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5383449530160654, "grad_norm": 1.6234121322631836, "kl": 22.1875, "learning_rate": 8.577783859442488e-06, "loss": 0.0222, "reward": 0.003354538755957037, "reward_std": 0.10785576701164246, "rewards/ndcg_rule_reward": -0.022036085836589336, "rewards/rule_reward": 0.025390625, "step": 888, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5389511973325251, "grad_norm": 1.8057562112808228, "kl": 23.5, "learning_rate": 8.574354186326001e-06, "loss": 0.0235, "reward": 0.0031888416269794106, "reward_std": 0.1163768619298935, "rewards/ndcg_rule_reward": -0.024154908023774624, "rewards/rule_reward": 0.02734375, "step": 889, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5395574416489846, "grad_norm": 1.0084275007247925, "kl": 40.0625, "learning_rate": 8.570921070297318e-06, "loss": 0.0401, "reward": 0.003162506502121687, "reward_std": 0.09109838679432869, "rewards/ndcg_rule_reward": -0.018321868032217026, "rewards/rule_reward": 0.021484375, "step": 890, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5401636859654441, "grad_norm": 2.269179344177246, "kl": 55.25, "learning_rate": 8.567484514663307e-06, "loss": 0.0552, "reward": 0.002455294947139919, "reward_std": 0.07458599656820297, "rewards/ndcg_rule_reward": -0.01512282993644476, "rewards/rule_reward": 0.017578125, "step": 891, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5407699302819036, "grad_norm": 3.298006772994995, "kl": 59.75, "learning_rate": 8.564044522734147e-06, "loss": 0.0598, "reward": 0.0034963947255164385, "reward_std": 0.10779277235269547, "rewards/ndcg_rule_reward": -0.02189423143863678, "rewards/rule_reward": 0.025390625, "step": 892, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5413761745983632, "grad_norm": 1.6193019151687622, "kl": 20.125, "learning_rate": 8.56060109782333e-06, "loss": 0.0201, "reward": 0.0018765838467516005, "reward_std": 0.0748646929860115, "rewards/ndcg_rule_reward": -0.015701541677117348, "rewards/rule_reward": 0.017578125, "step": 893, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.5419824189148227, "grad_norm": 1.9236129522323608, "kl": 71.75, "learning_rate": 8.557154243247655e-06, "loss": 0.0717, "reward": 0.004021170549094677, "reward_std": 0.1243537962436676, "rewards/ndcg_rule_reward": -0.025275705382227898, "rewards/rule_reward": 0.029296875, "step": 894, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.73828125, "epoch": 0.5425886632312822, "grad_norm": 1.1583739519119263, "kl": 29.1875, "learning_rate": 8.55370396232722e-06, "loss": 0.0292, "reward": 0.0023315525613725185, "reward_std": 0.06622567400336266, "rewards/ndcg_rule_reward": -0.013293447904288769, "rewards/rule_reward": 0.015625, "step": 895, "token_diversity": 0.27443065693430657 }, { "categorical_diversity": 1.0, "completion_length": 6.921875, "epoch": 0.5431949075477417, "grad_norm": 2.3730571269989014, "kl": 37.6875, "learning_rate": 8.550250258385429e-06, "loss": 0.0377, "reward": 0.004784397315233946, "reward_std": 0.14924107491970062, "rewards/ndcg_rule_reward": -0.030371851287782192, "rewards/rule_reward": 0.03515625, "step": 896, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 8.603515625, "epoch": 0.5438011518642013, "grad_norm": 35.474544525146484, "kl": 130.4375, "learning_rate": 8.54679313474898e-06, "loss": 0.1302, "reward": 0.0034475058782845736, "reward_std": 0.12465604394674301, "rewards/ndcg_rule_reward": -0.025849368423223495, "rewards/rule_reward": 0.029296875, "step": 897, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5444073961806608, "grad_norm": 1.687143087387085, "kl": 25.5625, "learning_rate": 8.543332594747864e-06, "loss": 0.0255, "reward": 0.0026680855080485344, "reward_std": 0.09973905980587006, "rewards/ndcg_rule_reward": -0.02076941542327404, "rewards/rule_reward": 0.0234375, "step": 898, "token_diversity": 0.3828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5450136404971203, "grad_norm": 1.509682536125183, "kl": 21.03125, "learning_rate": 8.53986864171537e-06, "loss": 0.021, "reward": 0.00352305022533983, "reward_std": 0.11617817729711533, "rewards/ndcg_rule_reward": -0.02382069919258356, "rewards/rule_reward": 0.02734375, "step": 899, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5456198848135799, "grad_norm": 1.0344284772872925, "kl": 20.25, "learning_rate": 8.536401278988063e-06, "loss": 0.0202, "reward": 0.0025301300920546055, "reward_std": 0.07455902546644211, "rewards/ndcg_rule_reward": -0.015047994907945395, "rewards/rule_reward": 0.017578125, "step": 900, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5462261291300394, "grad_norm": 2.513359785079956, "kl": 29.375, "learning_rate": 8.532930509905799e-06, "loss": 0.0294, "reward": 0.004539871588349342, "reward_std": 0.1577805131673813, "rewards/ndcg_rule_reward": -0.03256950434297323, "rewards/rule_reward": 0.037109375, "step": 901, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5468323734464989, "grad_norm": 1.2726280689239502, "kl": 18.109375, "learning_rate": 8.529456337811716e-06, "loss": 0.0181, "reward": 0.003104292554780841, "reward_std": 0.10793888568878174, "rewards/ndcg_rule_reward": -0.022286332212388515, "rewards/rule_reward": 0.025390625, "step": 902, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5474386177629584, "grad_norm": 1.464083194732666, "kl": 29.9375, "learning_rate": 8.52597876605223e-06, "loss": 0.0299, "reward": 0.00396476814057678, "reward_std": 0.11597340553998947, "rewards/ndcg_rule_reward": -0.02337898127734661, "rewards/rule_reward": 0.02734375, "step": 903, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.548044862079418, "grad_norm": 3.4753754138946533, "kl": 40.5, "learning_rate": 8.522497797977026e-06, "loss": 0.0404, "reward": 0.0035443734377622604, "reward_std": 0.11619797348976135, "rewards/ndcg_rule_reward": -0.023799377493560314, "rewards/rule_reward": 0.02734375, "step": 904, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5486511063958776, "grad_norm": 2.3317811489105225, "kl": 41.5, "learning_rate": 8.519013436939062e-06, "loss": 0.0414, "reward": 0.0039700373308733106, "reward_std": 0.13279059529304504, "rewards/ndcg_rule_reward": -0.02727996278554201, "rewards/rule_reward": 0.03125, "step": 905, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5492573507123371, "grad_norm": 1.7934266328811646, "kl": 24.5, "learning_rate": 8.515525686294576e-06, "loss": 0.0245, "reward": 0.003521244740113616, "reward_std": 0.12457231432199478, "rewards/ndcg_rule_reward": -0.025775629095733166, "rewards/rule_reward": 0.029296875, "step": 906, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5498635950287966, "grad_norm": 1.450993299484253, "kl": 47.625, "learning_rate": 8.512034549403053e-06, "loss": 0.0476, "reward": 0.003485808614641428, "reward_std": 0.11618334800004959, "rewards/ndcg_rule_reward": -0.02385794185101986, "rewards/rule_reward": 0.02734375, "step": 907, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5504698393452562, "grad_norm": 1.761056900024414, "kl": 68.125, "learning_rate": 8.50854002962725e-06, "loss": 0.0681, "reward": 0.004251979989930987, "reward_std": 0.13264402002096176, "rewards/ndcg_rule_reward": -0.026998020708560944, "rewards/rule_reward": 0.03125, "step": 908, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5510760836617157, "grad_norm": 0.9727344512939453, "kl": 35.625, "learning_rate": 8.505042130333182e-06, "loss": 0.0356, "reward": 0.002760774688795209, "reward_std": 0.07445616647601128, "rewards/ndcg_rule_reward": -0.014817351009696722, "rewards/rule_reward": 0.017578125, "step": 909, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5516823279781752, "grad_norm": 1.786998987197876, "kl": 51.625, "learning_rate": 8.501540854890117e-06, "loss": 0.0516, "reward": 0.004514191881753504, "reward_std": 0.15778061002492905, "rewards/ndcg_rule_reward": -0.032595181837677956, "rewards/rule_reward": 0.037109375, "step": 910, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5522885722946347, "grad_norm": 1.031164526939392, "kl": 47.5, "learning_rate": 8.498036206670577e-06, "loss": 0.0474, "reward": 0.0032184049487113953, "reward_std": 0.09107794240117073, "rewards/ndcg_rule_reward": -0.018265969585627317, "rewards/rule_reward": 0.021484375, "step": 911, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5528948166110943, "grad_norm": 2.629220724105835, "kl": 24.78125, "learning_rate": 8.494528189050328e-06, "loss": 0.0248, "reward": 0.0023512584739364684, "reward_std": 0.07464930228888988, "rewards/ndcg_rule_reward": -0.01522686704993248, "rewards/rule_reward": 0.017578125, "step": 912, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5535010609275538, "grad_norm": 2.0689022541046143, "kl": 24.625, "learning_rate": 8.491016805408387e-06, "loss": 0.0246, "reward": 0.0032655945979058743, "reward_std": 0.12471742182970047, "rewards/ndcg_rule_reward": -0.026031280867755413, "rewards/rule_reward": 0.029296875, "step": 913, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5541073052440133, "grad_norm": 2.5808587074279785, "kl": 23.75, "learning_rate": 8.487502059127015e-06, "loss": 0.0238, "reward": 0.002729161409661174, "reward_std": 0.09132520109415054, "rewards/ndcg_rule_reward": -0.018755214754492044, "rewards/rule_reward": 0.021484375, "step": 914, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.5547135495604729, "grad_norm": 1.8729811906814575, "kl": 53.0, "learning_rate": 8.483983953591703e-06, "loss": 0.053, "reward": 0.004315750673413277, "reward_std": 0.14944957941770554, "rewards/ndcg_rule_reward": -0.030840499326586723, "rewards/rule_reward": 0.03515625, "step": 915, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.025390625, "epoch": 0.5553197938769324, "grad_norm": 10.613578796386719, "kl": 41.625, "learning_rate": 8.480462492191187e-06, "loss": 0.0416, "reward": 0.0039476214442402124, "reward_std": 0.12440657243132591, "rewards/ndcg_rule_reward": -0.025349254719913006, "rewards/rule_reward": 0.029296875, "step": 916, "token_diversity": 0.4294104609929078 }, { "categorical_diversity": 1.0, "completion_length": 5.658203125, "epoch": 0.5559260381933919, "grad_norm": 1.6807794570922852, "kl": 18.109375, "learning_rate": 8.476937678317428e-06, "loss": 0.0181, "reward": 0.004110208945348859, "reward_std": 0.13275441527366638, "rewards/ndcg_rule_reward": -0.02713979221880436, "rewards/rule_reward": 0.03125, "step": 917, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5565322825098514, "grad_norm": 1.2849135398864746, "kl": 23.375, "learning_rate": 8.47340951536562e-06, "loss": 0.0234, "reward": 0.003268987755291164, "reward_std": 0.09946206584572792, "rewards/ndcg_rule_reward": -0.020168512128293514, "rewards/rule_reward": 0.0234375, "step": 918, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.557138526826311, "grad_norm": 1.3819679021835327, "kl": 16.40625, "learning_rate": 8.469878006734186e-06, "loss": 0.0164, "reward": 0.002970413537696004, "reward_std": 0.10800987482070923, "rewards/ndcg_rule_reward": -0.022420210763812065, "rewards/rule_reward": 0.025390625, "step": 919, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5577447711427705, "grad_norm": 3.264103651046753, "kl": 14.5625, "learning_rate": 8.466343155824765e-06, "loss": 0.0146, "reward": 0.0032961524557322264, "reward_std": 0.10788486525416374, "rewards/ndcg_rule_reward": -0.022094471380114555, "rewards/rule_reward": 0.025390625, "step": 920, "token_diversity": 0.375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.55835101545923, "grad_norm": 1.3906376361846924, "kl": 19.65625, "learning_rate": 8.462804966042219e-06, "loss": 0.0197, "reward": 0.0032336796866729856, "reward_std": 0.10786217451095581, "rewards/ndcg_rule_reward": -0.02215694636106491, "rewards/rule_reward": 0.025390625, "step": 921, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.5589572597756896, "grad_norm": 1.5260610580444336, "kl": 51.125, "learning_rate": 8.459263440794627e-06, "loss": 0.0511, "reward": 0.004392862552776933, "reward_std": 0.1326029673218727, "rewards/ndcg_rule_reward": -0.02685713768005371, "rewards/rule_reward": 0.03125, "step": 922, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5595635040921492, "grad_norm": 1.627488374710083, "kl": 26.875, "learning_rate": 8.45571858349328e-06, "loss": 0.0269, "reward": 0.002580973319709301, "reward_std": 0.09980277344584465, "rewards/ndcg_rule_reward": -0.020856525748968124, "rewards/rule_reward": 0.0234375, "step": 923, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.5601697484086087, "grad_norm": 1.6884793043136597, "kl": 54.25, "learning_rate": 8.452170397552676e-06, "loss": 0.0543, "reward": 0.003513246076181531, "reward_std": 0.11617621779441833, "rewards/ndcg_rule_reward": -0.023830505087971687, "rewards/rule_reward": 0.02734375, "step": 924, "token_diversity": 0.38343253968253965 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5607759927250682, "grad_norm": 1.3451703786849976, "kl": 44.125, "learning_rate": 8.448618886390523e-06, "loss": 0.044, "reward": 0.002803342940751463, "reward_std": 0.08284439891576767, "rewards/ndcg_rule_reward": -0.016727907583117485, "rewards/rule_reward": 0.01953125, "step": 925, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5613822370415278, "grad_norm": 1.619835376739502, "kl": 40.125, "learning_rate": 8.445064053427728e-06, "loss": 0.0402, "reward": 0.003908440354280174, "reward_std": 0.12442228198051453, "rewards/ndcg_rule_reward": -0.025388434529304504, "rewards/rule_reward": 0.029296875, "step": 926, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.5619884813579873, "grad_norm": 1.294687032699585, "kl": 37.25, "learning_rate": 8.4415059020884e-06, "loss": 0.0373, "reward": 0.0021207560785114765, "reward_std": 0.07475308328866959, "rewards/ndcg_rule_reward": -0.015457369852811098, "rewards/rule_reward": 0.017578125, "step": 927, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5625947256744468, "grad_norm": 0.8468235731124878, "kl": 21.3125, "learning_rate": 8.437944435799847e-06, "loss": 0.0214, "reward": 0.0017231033998541534, "reward_std": 0.05811327323317528, "rewards/ndcg_rule_reward": -0.011948771309107542, "rewards/rule_reward": 0.013671875, "step": 928, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5632009699909063, "grad_norm": 2.0736968517303467, "kl": 24.5, "learning_rate": 8.434379657992567e-06, "loss": 0.0246, "reward": 0.0032127664890140295, "reward_std": 0.1163032054901123, "rewards/ndcg_rule_reward": -0.0241309842094779, "rewards/rule_reward": 0.02734375, "step": 929, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5638072143073659, "grad_norm": 1.584747314453125, "kl": 23.75, "learning_rate": 8.430811572100245e-06, "loss": 0.0238, "reward": 0.0034208837896585464, "reward_std": 0.10780692100524902, "rewards/ndcg_rule_reward": -0.021969742141664028, "rewards/rule_reward": 0.025390625, "step": 930, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5644134586238254, "grad_norm": 1.8990256786346436, "kl": 17.125, "learning_rate": 8.427240181559754e-06, "loss": 0.0171, "reward": 0.002335108525585383, "reward_std": 0.09148753434419632, "rewards/ndcg_rule_reward": -0.0191492666490376, "rewards/rule_reward": 0.021484375, "step": 931, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5650197029402849, "grad_norm": 2.2019035816192627, "kl": 15.8125, "learning_rate": 8.423665489811159e-06, "loss": 0.0158, "reward": 0.002953467075712979, "reward_std": 0.09964808821678162, "rewards/ndcg_rule_reward": -0.020484033040702343, "rewards/rule_reward": 0.0234375, "step": 932, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5656259472567444, "grad_norm": 1.4397499561309814, "kl": 23.0625, "learning_rate": 8.420087500297691e-06, "loss": 0.0231, "reward": 0.0031369600910693407, "reward_std": 0.1079283319413662, "rewards/ndcg_rule_reward": -0.022253666073083878, "rewards/rule_reward": 0.025390625, "step": 933, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.566232191573204, "grad_norm": 1.617465615272522, "kl": 38.0625, "learning_rate": 8.416506216465766e-06, "loss": 0.038, "reward": 0.004676567390561104, "reward_std": 0.14088357985019684, "rewards/ndcg_rule_reward": -0.028526557609438896, "rewards/rule_reward": 0.033203125, "step": 934, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5668384358896635, "grad_norm": 1.1486815214157104, "kl": 21.1875, "learning_rate": 8.41292164176497e-06, "loss": 0.0212, "reward": 0.0026255007251165807, "reward_std": 0.09976042807102203, "rewards/ndcg_rule_reward": -0.02081199921667576, "rewards/rule_reward": 0.0234375, "step": 935, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.567444680206123, "grad_norm": 2.2917304039001465, "kl": 38.25, "learning_rate": 8.40933377964806e-06, "loss": 0.0381, "reward": 0.0037374760722741485, "reward_std": 0.12450544163584709, "rewards/ndcg_rule_reward": -0.025559398345649242, "rewards/rule_reward": 0.029296875, "step": 936, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5680509245225825, "grad_norm": 1.2727445363998413, "kl": 35.125, "learning_rate": 8.405742633570961e-06, "loss": 0.0351, "reward": 0.002816347754560411, "reward_std": 0.09127026051282883, "rewards/ndcg_rule_reward": -0.018668027594685555, "rewards/rule_reward": 0.021484375, "step": 937, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5686571688390422, "grad_norm": 1.7171714305877686, "kl": 63.75, "learning_rate": 8.402148206992758e-06, "loss": 0.0638, "reward": 0.004929474554955959, "reward_std": 0.1575956791639328, "rewards/ndcg_rule_reward": -0.03217990044504404, "rewards/rule_reward": 0.037109375, "step": 938, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5692634131555017, "grad_norm": 1.5684244632720947, "kl": 36.625, "learning_rate": 8.3985505033757e-06, "loss": 0.0366, "reward": 0.003090085694566369, "reward_std": 0.10794151946902275, "rewards/ndcg_rule_reward": -0.022300539072602987, "rewards/rule_reward": 0.025390625, "step": 939, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5698696574719612, "grad_norm": 1.8344438076019287, "kl": 31.125, "learning_rate": 8.394949526185185e-06, "loss": 0.0312, "reward": 0.0034986671525985003, "reward_std": 0.1246151477098465, "rewards/ndcg_rule_reward": -0.025798209011554718, "rewards/rule_reward": 0.029296875, "step": 940, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.412109375, "epoch": 0.5704759017884208, "grad_norm": 1.753484845161438, "kl": 25.875, "learning_rate": 8.391345278889774e-06, "loss": 0.0259, "reward": 0.0040182743687182665, "reward_std": 0.12434874102473259, "rewards/ndcg_rule_reward": -0.025278599932789803, "rewards/rule_reward": 0.029296875, "step": 941, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5710821461048803, "grad_norm": 1.6183574199676514, "kl": 41.0, "learning_rate": 8.387737764961171e-06, "loss": 0.041, "reward": 0.003198630642145872, "reward_std": 0.10790745913982391, "rewards/ndcg_rule_reward": -0.022191994823515415, "rewards/rule_reward": 0.025390625, "step": 942, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5716883904213398, "grad_norm": 1.6090216636657715, "kl": 36.125, "learning_rate": 8.384126987874227e-06, "loss": 0.0361, "reward": 0.003793832380324602, "reward_std": 0.13290837034583092, "rewards/ndcg_rule_reward": -0.027456168085336685, "rewards/rule_reward": 0.03125, "step": 943, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5722946347377993, "grad_norm": 1.67598557472229, "kl": 36.75, "learning_rate": 8.380512951106942e-06, "loss": 0.0367, "reward": 0.0028249488677829504, "reward_std": 0.09965591132640839, "rewards/ndcg_rule_reward": -0.02061255183070898, "rewards/rule_reward": 0.0234375, "step": 944, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5729008790542589, "grad_norm": 1.6044796705245972, "kl": 51.625, "learning_rate": 8.376895658140447e-06, "loss": 0.0516, "reward": 0.0033893360523507, "reward_std": 0.09938818216323853, "rewards/ndcg_rule_reward": -0.02004816383123398, "rewards/rule_reward": 0.0234375, "step": 945, "token_diversity": 0.5390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5735071233707184, "grad_norm": 1.9437637329101562, "kl": 19.5625, "learning_rate": 8.373275112459016e-06, "loss": 0.0196, "reward": 0.002176293754018843, "reward_std": 0.0915667936205864, "rewards/ndcg_rule_reward": -0.019308081828057766, "rewards/rule_reward": 0.021484375, "step": 946, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5741133676871779, "grad_norm": 1.5916260480880737, "kl": 37.125, "learning_rate": 8.369651317550055e-06, "loss": 0.0371, "reward": 0.0036964984610676765, "reward_std": 0.11608364433050156, "rewards/ndcg_rule_reward": -0.02364725060760975, "rewards/rule_reward": 0.02734375, "step": 947, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5747196120036374, "grad_norm": 1.1239745616912842, "kl": 14.328125, "learning_rate": 8.366024276904096e-06, "loss": 0.0143, "reward": 0.002168630948290229, "reward_std": 0.07473953813314438, "rewards/ndcg_rule_reward": -0.015409493818879128, "rewards/rule_reward": 0.017578125, "step": 948, "token_diversity": 0.53125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.575325856320097, "grad_norm": 1.344870924949646, "kl": 22.6875, "learning_rate": 8.362393994014804e-06, "loss": 0.0226, "reward": 0.002123804995790124, "reward_std": 0.0747511014342308, "rewards/ndcg_rule_reward": -0.01545432023704052, "rewards/rule_reward": 0.017578125, "step": 949, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5759321006365565, "grad_norm": 1.1582412719726562, "kl": 27.75, "learning_rate": 8.358760472378961e-06, "loss": 0.0278, "reward": 0.0021630986593663692, "reward_std": 0.07473965734243393, "rewards/ndcg_rule_reward": -0.01541502634063363, "rewards/rule_reward": 0.017578125, "step": 950, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.576538344953016, "grad_norm": 1.7777979373931885, "kl": 26.1875, "learning_rate": 8.35512371549647e-06, "loss": 0.0262, "reward": 0.0033626118674874306, "reward_std": 0.10782298818230629, "rewards/ndcg_rule_reward": -0.022028014063835144, "rewards/rule_reward": 0.025390625, "step": 951, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5771445892694756, "grad_norm": 1.3753793239593506, "kl": 24.4375, "learning_rate": 8.351483726870351e-06, "loss": 0.0245, "reward": 0.0030696226749569178, "reward_std": 0.09955759719014168, "rewards/ndcg_rule_reward": -0.02036787662655115, "rewards/rule_reward": 0.0234375, "step": 952, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5777508335859352, "grad_norm": 1.6686192750930786, "kl": 38.1875, "learning_rate": 8.347840510006741e-06, "loss": 0.0383, "reward": 0.00390548980794847, "reward_std": 0.11599356308579445, "rewards/ndcg_rule_reward": -0.02343826089054346, "rewards/rule_reward": 0.02734375, "step": 953, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5783570779023947, "grad_norm": 0.9841766357421875, "kl": 30.8125, "learning_rate": 8.344194068414877e-06, "loss": 0.0308, "reward": 0.0031986989779397845, "reward_std": 0.0826543103903532, "rewards/ndcg_rule_reward": -0.016332551604136825, "rewards/rule_reward": 0.01953125, "step": 954, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5789633222188542, "grad_norm": 1.1328420639038086, "kl": 24.6875, "learning_rate": 8.340544405607112e-06, "loss": 0.0246, "reward": 0.003518568875733763, "reward_std": 0.09091974049806595, "rewards/ndcg_rule_reward": -0.017965806648135185, "rewards/rule_reward": 0.021484375, "step": 955, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5795695665353138, "grad_norm": 1.0740528106689453, "kl": 45.6875, "learning_rate": 8.336891525098894e-06, "loss": 0.0457, "reward": 0.0027188941603526473, "reward_std": 0.07447263039648533, "rewards/ndcg_rule_reward": -0.01485923188738525, "rewards/rule_reward": 0.017578125, "step": 956, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5801758108517733, "grad_norm": 1.4515827894210815, "kl": 37.375, "learning_rate": 8.333235430408776e-06, "loss": 0.0373, "reward": 0.0034011363750323653, "reward_std": 0.10781363770365715, "rewards/ndcg_rule_reward": -0.021989489905536175, "rewards/rule_reward": 0.025390625, "step": 957, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.5807820551682328, "grad_norm": 0.951810359954834, "kl": 15.78125, "learning_rate": 8.329576125058406e-06, "loss": 0.0158, "reward": 0.0025548447156324983, "reward_std": 0.07455358281731606, "rewards/ndcg_rule_reward": -0.015023281332105398, "rewards/rule_reward": 0.017578125, "step": 958, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5813882994846923, "grad_norm": 2.065507173538208, "kl": 22.1875, "learning_rate": 8.32591361257252e-06, "loss": 0.0222, "reward": 0.0028854734264314175, "reward_std": 0.09121432527899742, "rewards/ndcg_rule_reward": -0.018598902504891157, "rewards/rule_reward": 0.021484375, "step": 959, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5819945438011519, "grad_norm": 2.06252384185791, "kl": 35.0, "learning_rate": 8.32224789647895e-06, "loss": 0.0351, "reward": 0.0037407060153782368, "reward_std": 0.13293786346912384, "rewards/ndcg_rule_reward": -0.02750929445028305, "rewards/rule_reward": 0.03125, "step": 960, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.01953125, "epoch": 0.5826007881176114, "grad_norm": 2.8750500679016113, "kl": 31.5, "learning_rate": 8.31857898030861e-06, "loss": 0.0315, "reward": 0.0038707552012056112, "reward_std": 0.1412590742111206, "rewards/ndcg_rule_reward": -0.029332369565963745, "rewards/rule_reward": 0.033203125, "step": 961, "token_diversity": 0.47492074275362317 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5832070324340709, "grad_norm": 1.3986866474151611, "kl": 40.0625, "learning_rate": 8.31490686759549e-06, "loss": 0.04, "reward": 0.0037788068875670433, "reward_std": 0.12448959425091743, "rewards/ndcg_rule_reward": -0.025518068112432957, "rewards/rule_reward": 0.029296875, "step": 962, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 6.201171875, "epoch": 0.5838132767505305, "grad_norm": 1.8996198177337646, "kl": 45.625, "learning_rate": 8.311231561876675e-06, "loss": 0.0456, "reward": 0.004072489915415645, "reward_std": 0.12434132769703865, "rewards/ndcg_rule_reward": -0.025224385783076286, "rewards/rule_reward": 0.029296875, "step": 963, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.58441952106699, "grad_norm": 1.7389309406280518, "kl": 61.625, "learning_rate": 8.307553066692314e-06, "loss": 0.0615, "reward": 0.003985231276601553, "reward_std": 0.12435732781887054, "rewards/ndcg_rule_reward": -0.025311642326414585, "rewards/rule_reward": 0.029296875, "step": 964, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5850257653834495, "grad_norm": 7.46934700012207, "kl": 19.75, "learning_rate": 8.30387138558563e-06, "loss": 0.0197, "reward": 0.003729453543201089, "reward_std": 0.132908683270216, "rewards/ndcg_rule_reward": -0.02752054762095213, "rewards/rule_reward": 0.03125, "step": 965, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.185546875, "epoch": 0.585632009699909, "grad_norm": 6.056149482727051, "kl": 42.875, "learning_rate": 8.300186522102912e-06, "loss": 0.0427, "reward": 0.002842091955244541, "reward_std": 0.09122458845376968, "rewards/ndcg_rule_reward": -0.01864228304475546, "rewards/rule_reward": 0.021484375, "step": 966, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5862382540163686, "grad_norm": 1.816653847694397, "kl": 55.75, "learning_rate": 8.29649847979352e-06, "loss": 0.0558, "reward": 0.003762264852412045, "reward_std": 0.1076689288020134, "rewards/ndcg_rule_reward": -0.02162836119532585, "rewards/rule_reward": 0.025390625, "step": 967, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5868444983328281, "grad_norm": 1.2516777515411377, "kl": 44.25, "learning_rate": 8.292807262209874e-06, "loss": 0.0443, "reward": 0.0036620175233110785, "reward_std": 0.10770369693636894, "rewards/ndcg_rule_reward": -0.021728606894612312, "rewards/rule_reward": 0.025390625, "step": 968, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5874507426492876, "grad_norm": 1.046379804611206, "kl": 25.0, "learning_rate": 8.289112872907454e-06, "loss": 0.025, "reward": 0.0034595418255776167, "reward_std": 0.09936937317252159, "rewards/ndcg_rule_reward": -0.019977958407253027, "rewards/rule_reward": 0.0234375, "step": 969, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5880569869657472, "grad_norm": 1.3374660015106201, "kl": 29.28125, "learning_rate": 8.28541531544479e-06, "loss": 0.0293, "reward": 0.0030047232867218554, "reward_std": 0.09117461740970612, "rewards/ndcg_rule_reward": -0.018479651771485806, "rewards/rule_reward": 0.021484375, "step": 970, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 6.009765625, "epoch": 0.5886632312822068, "grad_norm": 0.9436930418014526, "kl": 24.75, "learning_rate": 8.281714593383467e-06, "loss": 0.0248, "reward": 0.0025839038426056504, "reward_std": 0.08294207975268364, "rewards/ndcg_rule_reward": -0.01694734673947096, "rewards/rule_reward": 0.01953125, "step": 971, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5892694755986663, "grad_norm": 2.629883289337158, "kl": 23.6875, "learning_rate": 8.27801071028812e-06, "loss": 0.0237, "reward": 0.001969058357644826, "reward_std": 0.08322521299123764, "rewards/ndcg_rule_reward": -0.01756219193339348, "rewards/rule_reward": 0.01953125, "step": 972, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5898757199151258, "grad_norm": 1.7532944679260254, "kl": 18.25, "learning_rate": 8.274303669726427e-06, "loss": 0.0183, "reward": 0.0031363058369606733, "reward_std": 0.12474939599633217, "rewards/ndcg_rule_reward": -0.026160569861531258, "rewards/rule_reward": 0.029296875, "step": 973, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5904819642315853, "grad_norm": 1.7320637702941895, "kl": 50.25, "learning_rate": 8.270593475269103e-06, "loss": 0.0503, "reward": 0.0034623537212610245, "reward_std": 0.10779639706015587, "rewards/ndcg_rule_reward": -0.021928271278738976, "rewards/rule_reward": 0.025390625, "step": 974, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5910882085480449, "grad_norm": 1.2609028816223145, "kl": 34.875, "learning_rate": 8.266880130489912e-06, "loss": 0.0349, "reward": 0.0032851400319486856, "reward_std": 0.09102313220500946, "rewards/ndcg_rule_reward": -0.01819923473522067, "rewards/rule_reward": 0.021484375, "step": 975, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5916944528645044, "grad_norm": 1.7872593402862549, "kl": 31.78125, "learning_rate": 8.263163638965639e-06, "loss": 0.0317, "reward": 0.003288191044703126, "reward_std": 0.1162370815873146, "rewards/ndcg_rule_reward": -0.024055558256804943, "rewards/rule_reward": 0.02734375, "step": 976, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.5923006971809639, "grad_norm": 1.2741398811340332, "kl": 38.6875, "learning_rate": 8.259444004276112e-06, "loss": 0.0388, "reward": 0.004180973628535867, "reward_std": 0.13270138204097748, "rewards/ndcg_rule_reward": -0.027069026604294777, "rewards/rule_reward": 0.03125, "step": 977, "token_diversity": 0.28507313829787234 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5929069414974235, "grad_norm": 3.396289348602295, "kl": 45.0, "learning_rate": 8.255721230004177e-06, "loss": 0.0449, "reward": 0.004129479522816837, "reward_std": 0.11588617414236069, "rewards/ndcg_rule_reward": -0.02321427036076784, "rewards/rule_reward": 0.02734375, "step": 978, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.593513185813883, "grad_norm": 79.2990493774414, "kl": 193.25, "learning_rate": 8.251995319735711e-06, "loss": 0.1939, "reward": 0.004419151460751891, "reward_std": 0.12416567653417587, "rewards/ndcg_rule_reward": -0.02487772423774004, "rewards/rule_reward": 0.029296875, "step": 979, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5941194301303425, "grad_norm": 1.6201298236846924, "kl": 20.71875, "learning_rate": 8.248266277059607e-06, "loss": 0.0207, "reward": 0.0035071101738139987, "reward_std": 0.10778052359819412, "rewards/ndcg_rule_reward": -0.02188351470977068, "rewards/rule_reward": 0.025390625, "step": 980, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.802734375, "epoch": 0.594725674446802, "grad_norm": 1.4537371397018433, "kl": 35.125, "learning_rate": 8.244534105567778e-06, "loss": 0.0351, "reward": 0.0027539387810975313, "reward_std": 0.09972748160362244, "rewards/ndcg_rule_reward": -0.020683561451733112, "rewards/rule_reward": 0.0234375, "step": 981, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5953319187632616, "grad_norm": 1.1658525466918945, "kl": 31.5, "learning_rate": 8.24079880885515e-06, "loss": 0.0315, "reward": 0.0022638231748715043, "reward_std": 0.07470765337347984, "rewards/ndcg_rule_reward": -0.015314301941543818, "rewards/rule_reward": 0.017578125, "step": 982, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5959381630797211, "grad_norm": 1.4229602813720703, "kl": 62.125, "learning_rate": 8.237060390519661e-06, "loss": 0.0621, "reward": 0.003740090411156416, "reward_std": 0.10767609253525734, "rewards/ndcg_rule_reward": -0.02165053505450487, "rewards/rule_reward": 0.025390625, "step": 983, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5965444073961806, "grad_norm": 2.477200984954834, "kl": 41.375, "learning_rate": 8.233318854162253e-06, "loss": 0.0414, "reward": 0.0031704921275377274, "reward_std": 0.11634943634271622, "rewards/ndcg_rule_reward": -0.024173258803784847, "rewards/rule_reward": 0.02734375, "step": 984, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5971506517126401, "grad_norm": 1.695927381515503, "kl": 40.125, "learning_rate": 8.22957420338687e-06, "loss": 0.0401, "reward": 0.003805827349424362, "reward_std": 0.14128300920128822, "rewards/ndcg_rule_reward": -0.029397299513220787, "rewards/rule_reward": 0.033203125, "step": 985, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5977568960290998, "grad_norm": 2.1485443115234375, "kl": 12.1875, "learning_rate": 8.225826441800462e-06, "loss": 0.0122, "reward": 0.0020731878466904163, "reward_std": 0.09159744530916214, "rewards/ndcg_rule_reward": -0.01941118761897087, "rewards/rule_reward": 0.021484375, "step": 986, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5983631403455593, "grad_norm": 1.5107086896896362, "kl": 29.375, "learning_rate": 8.222075573012971e-06, "loss": 0.0293, "reward": 0.0025186636485159397, "reward_std": 0.08299051970243454, "rewards/ndcg_rule_reward": -0.017012587748467922, "rewards/rule_reward": 0.01953125, "step": 987, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5989693846620188, "grad_norm": 1.6851502656936646, "kl": 17.5, "learning_rate": 8.21832160063733e-06, "loss": 0.0175, "reward": 0.004250679397955537, "reward_std": 0.1579413115978241, "rewards/ndcg_rule_reward": -0.03285869397222996, "rewards/rule_reward": 0.037109375, "step": 988, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.5995756289784784, "grad_norm": 1.5092267990112305, "kl": 17.8125, "learning_rate": 8.214564528289468e-06, "loss": 0.0178, "reward": 0.0021908465423621237, "reward_std": 0.07469641044735909, "rewards/ndcg_rule_reward": -0.01538727805018425, "rewards/rule_reward": 0.017578125, "step": 989, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6001818732949379, "grad_norm": 1.6289979219436646, "kl": 32.3125, "learning_rate": 8.210804359588294e-06, "loss": 0.0324, "reward": 0.0023637073463760316, "reward_std": 0.09147027134895325, "rewards/ndcg_rule_reward": -0.019120668526738882, "rewards/rule_reward": 0.021484375, "step": 990, "token_diversity": 0.5 }, { "epoch": 0.6001818732949379, "eval_categorical_diversity": 1.0, "eval_completion_length": 5.0, "eval_kl": 10.113336267605634, "eval_loss": 0.010131514631211758, "eval_reward": 0.0013449842394755917, "eval_reward_std": 0.04761533858910413, "eval_rewards/ndcg_rule_reward": -0.009851098591855295, "eval_rewards/rule_reward": 0.011196082746478873, "eval_runtime": 85.102, "eval_samples_per_second": 53.254, "eval_steps_per_second": 0.059, "eval_token_diversity": 0.33926606514084506, "step": 990 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6007881176113974, "grad_norm": 7.14940071105957, "kl": 37.625, "learning_rate": 8.207041098155701e-06, "loss": 0.0376, "reward": 0.003284825768787414, "reward_std": 0.10785052925348282, "rewards/ndcg_rule_reward": -0.022105798590928316, "rewards/rule_reward": 0.025390625, "step": 991, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6013943619278569, "grad_norm": 0.6329979300498962, "kl": 21.4375, "learning_rate": 8.203274747616562e-06, "loss": 0.0215, "reward": 0.0021683797240257263, "reward_std": 0.06631005927920341, "rewards/ndcg_rule_reward": -0.013456620275974274, "rewards/rule_reward": 0.015625, "step": 992, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6020006062443165, "grad_norm": 1.8029248714447021, "kl": 27.5, "learning_rate": 8.199505311598727e-06, "loss": 0.0274, "reward": 0.003478261991403997, "reward_std": 0.10779395326972008, "rewards/ndcg_rule_reward": -0.02191236335784197, "rewards/rule_reward": 0.025390625, "step": 993, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.602606850560776, "grad_norm": 1.7009028196334839, "kl": 39.125, "learning_rate": 8.195732793733014e-06, "loss": 0.0391, "reward": 0.003218405065126717, "reward_std": 0.09107794240117073, "rewards/ndcg_rule_reward": -0.01826597098261118, "rewards/rule_reward": 0.021484375, "step": 994, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.85546875, "epoch": 0.6032130948772355, "grad_norm": 1.3662285804748535, "kl": 24.9375, "learning_rate": 8.191957197653213e-06, "loss": 0.0249, "reward": 0.0034939259057864547, "reward_std": 0.10779064148664474, "rewards/ndcg_rule_reward": -0.02189669944345951, "rewards/rule_reward": 0.025390625, "step": 995, "token_diversity": 0.2673748897707231 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.603819339193695, "grad_norm": 1.523613691329956, "kl": 32.125, "learning_rate": 8.188178526996079e-06, "loss": 0.0322, "reward": 0.0033474330557510257, "reward_std": 0.10783369466662407, "rewards/ndcg_rule_reward": -0.022043191827833652, "rewards/rule_reward": 0.025390625, "step": 996, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.220703125, "epoch": 0.6044255835101546, "grad_norm": 3.2408206462860107, "kl": 46.125, "learning_rate": 8.184396785401322e-06, "loss": 0.0461, "reward": 0.004918662365525961, "reward_std": 0.14920391142368317, "rewards/ndcg_rule_reward": -0.030237587168812752, "rewards/rule_reward": 0.03515625, "step": 997, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6050318278266141, "grad_norm": 1.243701696395874, "kl": 34.125, "learning_rate": 8.180611976511621e-06, "loss": 0.0341, "reward": 0.002884517190977931, "reward_std": 0.08281648904085159, "rewards/ndcg_rule_reward": -0.016646733507514, "rewards/rule_reward": 0.01953125, "step": 998, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6056380721430736, "grad_norm": 2.8383405208587646, "kl": 45.375, "learning_rate": 8.1768241039726e-06, "loss": 0.0454, "reward": 0.0029948254232294858, "reward_std": 0.09117938205599785, "rewards/ndcg_rule_reward": -0.018489548936486244, "rewards/rule_reward": 0.021484375, "step": 999, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.6062443164595331, "grad_norm": 1.7816743850708008, "kl": 37.296875, "learning_rate": 8.173033171432841e-06, "loss": 0.0373, "reward": 0.002005973015911877, "reward_std": 0.07478894293308258, "rewards/ndcg_rule_reward": -0.015572152100503445, "rewards/rule_reward": 0.017578125, "step": 1000, "token_diversity": 0.53125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6068505607759928, "grad_norm": 1.5482115745544434, "kl": 34.625, "learning_rate": 8.169239182543869e-06, "loss": 0.0345, "reward": 0.0029283309122547507, "reward_std": 0.09959358349442482, "rewards/ndcg_rule_reward": -0.020509169436991215, "rewards/rule_reward": 0.0234375, "step": 1001, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6074568050924523, "grad_norm": 1.2235958576202393, "kl": 49.5, "learning_rate": 8.16544214096015e-06, "loss": 0.0495, "reward": 0.0033908954355865717, "reward_std": 0.10782037675380707, "rewards/ndcg_rule_reward": -0.021999729797244072, "rewards/rule_reward": 0.025390625, "step": 1002, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6080630494089118, "grad_norm": 1.7826368808746338, "kl": 29.09375, "learning_rate": 8.1616420503391e-06, "loss": 0.029, "reward": 0.004236184526234865, "reward_std": 0.14110822975635529, "rewards/ndcg_rule_reward": -0.028966940939426422, "rewards/rule_reward": 0.033203125, "step": 1003, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.6086692937253714, "grad_norm": 1.471015453338623, "kl": 47.125, "learning_rate": 8.157838914341064e-06, "loss": 0.0471, "reward": 0.0035324827767908573, "reward_std": 0.12457122653722763, "rewards/ndcg_rule_reward": -0.02576439268887043, "rewards/rule_reward": 0.029296875, "step": 1004, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6092755380418309, "grad_norm": 1.5231815576553345, "kl": 54.75, "learning_rate": 8.154032736629324e-06, "loss": 0.0548, "reward": 0.00515999342314899, "reward_std": 0.14905411377549171, "rewards/ndcg_rule_reward": -0.02999625727534294, "rewards/rule_reward": 0.03515625, "step": 1005, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6098817823582904, "grad_norm": 1.9317418336868286, "kl": 19.5625, "learning_rate": 8.15022352087009e-06, "loss": 0.0195, "reward": 0.0025850431993603706, "reward_std": 0.09137151390314102, "rewards/ndcg_rule_reward": -0.018899332266300917, "rewards/rule_reward": 0.021484375, "step": 1006, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6104880266747499, "grad_norm": 0.975260853767395, "kl": 22.5, "learning_rate": 8.146411270732496e-06, "loss": 0.0224, "reward": 0.002936622011475265, "reward_std": 0.09119896963238716, "rewards/ndcg_rule_reward": -0.018547752872109413, "rewards/rule_reward": 0.021484375, "step": 1007, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6110942709912095, "grad_norm": 1.7317585945129395, "kl": 21.23046875, "learning_rate": 8.142595989888608e-06, "loss": 0.0213, "reward": 0.001930518716108054, "reward_std": 0.08326582983136177, "rewards/ndcg_rule_reward": -0.017600731924176216, "rewards/rule_reward": 0.01953125, "step": 1008, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.611700515307669, "grad_norm": 2.717249631881714, "kl": 38.1875, "learning_rate": 8.138777682013403e-06, "loss": 0.0382, "reward": 0.004453050438314676, "reward_std": 0.16627074778079987, "rewards/ndcg_rule_reward": -0.034609450958669186, "rewards/rule_reward": 0.0390625, "step": 1009, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6123067596241285, "grad_norm": 1.8839011192321777, "kl": 33.28125, "learning_rate": 8.134956350784775e-06, "loss": 0.0333, "reward": 0.0037109192926436663, "reward_std": 0.1413348987698555, "rewards/ndcg_rule_reward": -0.029492206871509552, "rewards/rule_reward": 0.033203125, "step": 1010, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.612913003940588, "grad_norm": 1.4116973876953125, "kl": 47.0, "learning_rate": 8.131131999883528e-06, "loss": 0.0471, "reward": 0.004461031872779131, "reward_std": 0.12414920330047607, "rewards/ndcg_rule_reward": -0.024835843592882156, "rewards/rule_reward": 0.029296875, "step": 1011, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6135192482570476, "grad_norm": 1.6755609512329102, "kl": 48.375, "learning_rate": 8.127304632993383e-06, "loss": 0.0483, "reward": 0.0036025824956595898, "reward_std": 0.09928901493549347, "rewards/ndcg_rule_reward": -0.019834917970001698, "rewards/rule_reward": 0.0234375, "step": 1012, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6141254925735071, "grad_norm": 1.4649089574813843, "kl": 50.375, "learning_rate": 8.123474253800958e-06, "loss": 0.0504, "reward": 0.004767544101923704, "reward_std": 0.1408531442284584, "rewards/ndcg_rule_reward": -0.028435581363737583, "rewards/rule_reward": 0.033203125, "step": 1013, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6147317368899666, "grad_norm": 1.529553771018982, "kl": 23.3125, "learning_rate": 8.119640865995774e-06, "loss": 0.0233, "reward": 0.0027435626834630966, "reward_std": 0.09971332922577858, "rewards/ndcg_rule_reward": -0.020693937316536903, "rewards/rule_reward": 0.0234375, "step": 1014, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6153379812064262, "grad_norm": 2.1423521041870117, "kl": 56.75, "learning_rate": 8.115804473270253e-06, "loss": 0.0568, "reward": 0.00446674763225019, "reward_std": 0.13258085399866104, "rewards/ndcg_rule_reward": -0.026783251203596592, "rewards/rule_reward": 0.03125, "step": 1015, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6159442255228857, "grad_norm": 1.7411922216415405, "kl": 41.9375, "learning_rate": 8.11196507931971e-06, "loss": 0.042, "reward": 0.004394362447783351, "reward_std": 0.13260313495993614, "rewards/ndcg_rule_reward": -0.02685563825070858, "rewards/rule_reward": 0.03125, "step": 1016, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6165504698393452, "grad_norm": 2.014302968978882, "kl": 27.25, "learning_rate": 8.108122687842349e-06, "loss": 0.0272, "reward": 0.0030649632681161165, "reward_std": 0.10794994980096817, "rewards/ndcg_rule_reward": -0.022325661964714527, "rewards/rule_reward": 0.025390625, "step": 1017, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.6171567141558048, "grad_norm": 1.554877758026123, "kl": 41.6875, "learning_rate": 8.104277302539264e-06, "loss": 0.0417, "reward": 0.0033723379019647837, "reward_std": 0.10782691463828087, "rewards/ndcg_rule_reward": -0.02201828733086586, "rewards/rule_reward": 0.025390625, "step": 1018, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6177629584722644, "grad_norm": 3.661560535430908, "kl": 24.1875, "learning_rate": 8.100428927114432e-06, "loss": 0.0241, "reward": 0.0028751561767421663, "reward_std": 0.10804271325469017, "rewards/ndcg_rule_reward": -0.022515468299388885, "rewards/rule_reward": 0.025390625, "step": 1019, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6183692027887239, "grad_norm": 1.4750897884368896, "kl": 24.4375, "learning_rate": 8.096577565274707e-06, "loss": 0.0244, "reward": 0.0037803981686010957, "reward_std": 0.1497245728969574, "rewards/ndcg_rule_reward": -0.03137585148215294, "rewards/rule_reward": 0.03515625, "step": 1020, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6189754471051834, "grad_norm": 1.3713645935058594, "kl": 32.375, "learning_rate": 8.092723220729826e-06, "loss": 0.0323, "reward": 0.0027332957834005356, "reward_std": 0.09129369258880615, "rewards/ndcg_rule_reward": -0.01875108014792204, "rewards/rule_reward": 0.021484375, "step": 1021, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6195816914216429, "grad_norm": 1.175461769104004, "kl": 51.375, "learning_rate": 8.088865897192391e-06, "loss": 0.0514, "reward": 0.0030165895586833358, "reward_std": 0.08274522796273232, "rewards/ndcg_rule_reward": -0.016514661256223917, "rewards/rule_reward": 0.01953125, "step": 1022, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6201879357381025, "grad_norm": 1.8211524486541748, "kl": 23.0, "learning_rate": 8.08500559837788e-06, "loss": 0.023, "reward": 0.002995468210428953, "reward_std": 0.09957563132047653, "rewards/ndcg_rule_reward": -0.020442032255232334, "rewards/rule_reward": 0.0234375, "step": 1023, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.620794180054562, "grad_norm": 1.8011376857757568, "kl": 21.96875, "learning_rate": 8.081142328004638e-06, "loss": 0.0219, "reward": 0.0028133634477853775, "reward_std": 0.11648687720298767, "rewards/ndcg_rule_reward": -0.024530387483537197, "rewards/rule_reward": 0.02734375, "step": 1024, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6214004243710215, "grad_norm": 3.2323288917541504, "kl": 2.6875, "learning_rate": 8.077276089793862e-06, "loss": 0.0027, "reward": 0.004137198557145894, "reward_std": 0.13271105289459229, "rewards/ndcg_rule_reward": -0.027112800627946854, "rewards/rule_reward": 0.03125, "step": 1025, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.622006668687481, "grad_norm": 1.8067911863327026, "kl": 0.775390625, "learning_rate": 8.07340688746962e-06, "loss": 0.0008, "reward": 0.0024953146930783987, "reward_std": 0.0745740756392479, "rewards/ndcg_rule_reward": -0.015082810074090958, "rewards/rule_reward": 0.017578125, "step": 1026, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6226129130039406, "grad_norm": 2.475598096847534, "kl": 2.3359375, "learning_rate": 8.069534724758828e-06, "loss": 0.0023, "reward": 0.003607077756896615, "reward_std": 0.14133064076304436, "rewards/ndcg_rule_reward": -0.02959604561328888, "rewards/rule_reward": 0.033203125, "step": 1027, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6232191573204001, "grad_norm": 3.086582899093628, "kl": 1.8515625, "learning_rate": 8.065659605391253e-06, "loss": 0.0019, "reward": 0.003915756708011031, "reward_std": 0.12441319972276688, "rewards/ndcg_rule_reward": -0.025381118059158325, "rewards/rule_reward": 0.029296875, "step": 1028, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6238254016368596, "grad_norm": 2.7282488346099854, "kl": 2.953125, "learning_rate": 8.061781533099516e-06, "loss": 0.003, "reward": 0.003221652237698436, "reward_std": 0.11633945256471634, "rewards/ndcg_rule_reward": -0.024122098460793495, "rewards/rule_reward": 0.02734375, "step": 1029, "token_diversity": 0.375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6244316459533192, "grad_norm": 1.621370792388916, "kl": 3.8359375, "learning_rate": 8.057900511619077e-06, "loss": 0.0038, "reward": 0.0023444101097993553, "reward_std": 0.09147882834076881, "rewards/ndcg_rule_reward": -0.019139965064823627, "rewards/rule_reward": 0.021484375, "step": 1030, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6250378902697787, "grad_norm": 1.8469643592834473, "kl": 4.28125, "learning_rate": 8.054016544688236e-06, "loss": 0.0043, "reward": 0.0023969279136508703, "reward_std": 0.08303723856806755, "rewards/ndcg_rule_reward": -0.01713432092219591, "rewards/rule_reward": 0.01953125, "step": 1031, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6256441345862382, "grad_norm": 1.8731112480163574, "kl": 5.3125, "learning_rate": 8.050129636048137e-06, "loss": 0.0053, "reward": 0.0031825893092900515, "reward_std": 0.11637059226632118, "rewards/ndcg_rule_reward": -0.024161160923540592, "rewards/rule_reward": 0.02734375, "step": 1032, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6262503789026977, "grad_norm": 2.1299703121185303, "kl": 15.15625, "learning_rate": 8.04623978944275e-06, "loss": 0.0152, "reward": 0.003015615278854966, "reward_std": 0.09957102686166763, "rewards/ndcg_rule_reward": -0.020421884953975677, "rewards/rule_reward": 0.0234375, "step": 1033, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6268566232191574, "grad_norm": 2.7651822566986084, "kl": 25.5625, "learning_rate": 8.042347008618873e-06, "loss": 0.0255, "reward": 0.0030242810025811195, "reward_std": 0.10799112543463707, "rewards/ndcg_rule_reward": -0.02236634399741888, "rewards/rule_reward": 0.025390625, "step": 1034, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6274628675356169, "grad_norm": 2.230407238006592, "kl": 20.125, "learning_rate": 8.038451297326146e-06, "loss": 0.0201, "reward": 0.0028830941300839186, "reward_std": 0.09964124113321304, "rewards/ndcg_rule_reward": -0.020554406568408012, "rewards/rule_reward": 0.0234375, "step": 1035, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6280691118520764, "grad_norm": 2.630277395248413, "kl": 12.234375, "learning_rate": 8.034552659317012e-06, "loss": 0.0122, "reward": 0.0026557851815596223, "reward_std": 0.10814327374100685, "rewards/ndcg_rule_reward": -0.022734840400516987, "rewards/rule_reward": 0.025390625, "step": 1036, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6286753561685359, "grad_norm": 4.437758445739746, "kl": 35.75, "learning_rate": 8.030651098346746e-06, "loss": 0.0357, "reward": 0.0026125116273760796, "reward_std": 0.09135843068361282, "rewards/ndcg_rule_reward": -0.01887186337262392, "rewards/rule_reward": 0.021484375, "step": 1037, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6292816004849955, "grad_norm": 12.498600006103516, "kl": 78.75, "learning_rate": 8.026746618173432e-06, "loss": 0.0787, "reward": 0.005078120622783899, "reward_std": 0.1491173952817917, "rewards/ndcg_rule_reward": -0.030078129842877388, "rewards/rule_reward": 0.03515625, "step": 1038, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.629887844801455, "grad_norm": 4.868231773376465, "kl": 78.4375, "learning_rate": 8.02283922255797e-06, "loss": 0.0784, "reward": 0.003705049864947796, "reward_std": 0.1329154670238495, "rewards/ndcg_rule_reward": -0.02754495106637478, "rewards/rule_reward": 0.03125, "step": 1039, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6304940891179145, "grad_norm": 1.9155113697052002, "kl": 37.171875, "learning_rate": 8.018928915264065e-06, "loss": 0.0372, "reward": 0.0027147793443873525, "reward_std": 0.091293565928936, "rewards/ndcg_rule_reward": -0.01876959577202797, "rewards/rule_reward": 0.021484375, "step": 1040, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6311003334343741, "grad_norm": 2.716113805770874, "kl": 30.5625, "learning_rate": 8.015015700058231e-06, "loss": 0.0306, "reward": 0.003696926636621356, "reward_std": 0.14134283363819122, "rewards/ndcg_rule_reward": -0.029506197199225426, "rewards/rule_reward": 0.033203125, "step": 1041, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6317065777508336, "grad_norm": 2.0860326290130615, "kl": 32.625, "learning_rate": 8.011099580709777e-06, "loss": 0.0326, "reward": 0.0036455453373491764, "reward_std": 0.13295892998576164, "rewards/ndcg_rule_reward": -0.02760445512831211, "rewards/rule_reward": 0.03125, "step": 1042, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 7.40234375, "epoch": 0.6323128220672931, "grad_norm": 5.210360050201416, "kl": 37.6875, "learning_rate": 8.007180560990814e-06, "loss": 0.0377, "reward": 0.004041098523885012, "reward_std": 0.15804223716259003, "rewards/ndcg_rule_reward": -0.033068276941776276, "rewards/rule_reward": 0.037109375, "step": 1043, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6329190663837526, "grad_norm": 4.698660373687744, "kl": 42.875, "learning_rate": 8.003258644676246e-06, "loss": 0.0429, "reward": 0.0033920633140951395, "reward_std": 0.13310562819242477, "rewards/ndcg_rule_reward": -0.02785793785005808, "rewards/rule_reward": 0.03125, "step": 1044, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6335253107002122, "grad_norm": 1.6497761011123657, "kl": 18.7900390625, "learning_rate": 7.999333835543764e-06, "loss": 0.0188, "reward": 0.002932338509708643, "reward_std": 0.09962983429431915, "rewards/ndcg_rule_reward": -0.02050516102463007, "rewards/rule_reward": 0.0234375, "step": 1045, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6341315550166717, "grad_norm": 1.8272892236709595, "kl": 25.3125, "learning_rate": 7.995406137373848e-06, "loss": 0.0252, "reward": 0.0044399285688996315, "reward_std": 0.14945083111524582, "rewards/ndcg_rule_reward": -0.03071632143110037, "rewards/rule_reward": 0.03515625, "step": 1046, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6347377993331312, "grad_norm": 1.4338146448135376, "kl": 10.265625, "learning_rate": 7.991475553949759e-06, "loss": 0.0103, "reward": 0.0018887263722717762, "reward_std": 0.07485619559884071, "rewards/ndcg_rule_reward": -0.01568939909338951, "rewards/rule_reward": 0.017578125, "step": 1047, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6353440436495907, "grad_norm": 1.5461769104003906, "kl": 5.7890625, "learning_rate": 7.987542089057543e-06, "loss": 0.0058, "reward": 0.001609616563655436, "reward_std": 0.05817972496151924, "rewards/ndcg_rule_reward": -0.012062259018421173, "rewards/rule_reward": 0.013671875, "step": 1048, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 6.822265625, "epoch": 0.6359502879660504, "grad_norm": 3.2953591346740723, "kl": 5.890625, "learning_rate": 7.983605746486014e-06, "loss": 0.0059, "reward": 0.0028138075722381473, "reward_std": 0.1080968976020813, "rewards/ndcg_rule_reward": -0.02257681731134653, "rewards/rule_reward": 0.025390625, "step": 1049, "token_diversity": 0.3179290254237288 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6365565322825099, "grad_norm": 3.406567096710205, "kl": 20.978515625, "learning_rate": 7.979666530026762e-06, "loss": 0.0209, "reward": 0.0012089446536265314, "reward_std": 0.05834144353866577, "rewards/ndcg_rule_reward": -0.01246292982250452, "rewards/rule_reward": 0.013671875, "step": 1050, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 7.642578125, "epoch": 0.6371627765989694, "grad_norm": 2.6738359928131104, "kl": 7.6806640625, "learning_rate": 7.975724443474143e-06, "loss": 0.0077, "reward": 0.0030036038951948285, "reward_std": 0.10800280421972275, "rewards/ndcg_rule_reward": -0.02238702028989792, "rewards/rule_reward": 0.025390625, "step": 1051, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6377690209154289, "grad_norm": 2.0573596954345703, "kl": 14.0, "learning_rate": 7.971779490625281e-06, "loss": 0.014, "reward": 0.003529972513206303, "reward_std": 0.10774512961506844, "rewards/ndcg_rule_reward": -0.02186065260320902, "rewards/rule_reward": 0.025390625, "step": 1052, "token_diversity": 0.35546875 }, { "categorical_diversity": 1.0, "completion_length": 9.564453125, "epoch": 0.6383752652318885, "grad_norm": 1.8709887266159058, "kl": 14.0625, "learning_rate": 7.967831675280056e-06, "loss": 0.0141, "reward": 0.003775448421947658, "reward_std": 0.12446209043264389, "rewards/ndcg_rule_reward": -0.025521425530314445, "rewards/rule_reward": 0.029296875, "step": 1053, "token_diversity": 0.3284254807692308 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.638981509548348, "grad_norm": 2.3055427074432373, "kl": 12.40625, "learning_rate": 7.963881001241106e-06, "loss": 0.0124, "reward": 0.004242069786414504, "reward_std": 0.12428352981805801, "rewards/ndcg_rule_reward": -0.025054804980754852, "rewards/rule_reward": 0.029296875, "step": 1054, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.6395877538648075, "grad_norm": 1.4231483936309814, "kl": 14.15625, "learning_rate": 7.959927472313832e-06, "loss": 0.0141, "reward": 0.0032257097773253918, "reward_std": 0.1247369833290577, "rewards/ndcg_rule_reward": -0.02607116475701332, "rewards/rule_reward": 0.029296875, "step": 1055, "token_diversity": 0.29475 }, { "categorical_diversity": 1.0, "completion_length": 7.8828125, "epoch": 0.6401939981812671, "grad_norm": 3.2081291675567627, "kl": 22.5, "learning_rate": 7.955971092306371e-06, "loss": 0.0225, "reward": 0.003959976835176349, "reward_std": 0.12437137216329575, "rewards/ndcg_rule_reward": -0.025336898863315582, "rewards/rule_reward": 0.029296875, "step": 1056, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6408002424977266, "grad_norm": 2.049710512161255, "kl": 12.6875, "learning_rate": 7.952011865029614e-06, "loss": 0.0127, "reward": 0.0037391323130577803, "reward_std": 0.10764718800783157, "rewards/ndcg_rule_reward": -0.02165149338543415, "rewards/rule_reward": 0.025390625, "step": 1057, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.6414064868141861, "grad_norm": 1.2274523973464966, "kl": 22.0625, "learning_rate": 7.948049794297192e-06, "loss": 0.0221, "reward": 0.003259327379055321, "reward_std": 0.099465511739254, "rewards/ndcg_rule_reward": -0.020178173668682575, "rewards/rule_reward": 0.0234375, "step": 1058, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.6420127311306456, "grad_norm": 8.620495796203613, "kl": 39.0625, "learning_rate": 7.94408488392548e-06, "loss": 0.0391, "reward": 0.0021722892415709794, "reward_std": 0.08313588052988052, "rewards/ndcg_rule_reward": -0.017358961515128613, "rewards/rule_reward": 0.01953125, "step": 1059, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6426189754471052, "grad_norm": 2.3004188537597656, "kl": 45.875, "learning_rate": 7.94011713773358e-06, "loss": 0.046, "reward": 0.0032304335618391633, "reward_std": 0.13311633095145226, "rewards/ndcg_rule_reward": -0.028019567020237446, "rewards/rule_reward": 0.03125, "step": 1060, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.6432252197635647, "grad_norm": 1.9777621030807495, "kl": 26.3125, "learning_rate": 7.93614655954333e-06, "loss": 0.0263, "reward": 0.0034350211499258876, "reward_std": 0.11623290926218033, "rewards/ndcg_rule_reward": -0.023908729664981365, "rewards/rule_reward": 0.02734375, "step": 1061, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6438314640800242, "grad_norm": 1.299952745437622, "kl": 5.072265625, "learning_rate": 7.9321731531793e-06, "loss": 0.005, "reward": 0.0022837818833068013, "reward_std": 0.08310055732727051, "rewards/ndcg_rule_reward": -0.01724746823310852, "rewards/rule_reward": 0.01953125, "step": 1062, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 7.8828125, "epoch": 0.6444377083964837, "grad_norm": 4.826422214508057, "kl": 54.125, "learning_rate": 7.928196922468772e-06, "loss": 0.0541, "reward": 0.0035741421161219478, "reward_std": 0.11616676300764084, "rewards/ndcg_rule_reward": -0.023769608698785305, "rewards/rule_reward": 0.02734375, "step": 1063, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6450439527129433, "grad_norm": 1.584441900253296, "kl": 37.625, "learning_rate": 7.924217871241763e-06, "loss": 0.0377, "reward": 0.003152723307721317, "reward_std": 0.09952306002378464, "rewards/ndcg_rule_reward": -0.020284777507185936, "rewards/rule_reward": 0.0234375, "step": 1064, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6456501970294029, "grad_norm": 1.885551929473877, "kl": 42.25, "learning_rate": 7.920236003330999e-06, "loss": 0.0422, "reward": 0.0045477699022740126, "reward_std": 0.13252615556120872, "rewards/ndcg_rule_reward": -0.02670223079621792, "rewards/rule_reward": 0.03125, "step": 1065, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.6462564413458624, "grad_norm": 1.7199715375900269, "kl": 46.0, "learning_rate": 7.916251322571918e-06, "loss": 0.046, "reward": 0.0038479273207485676, "reward_std": 0.11601004749536514, "rewards/ndcg_rule_reward": -0.02349582314491272, "rewards/rule_reward": 0.02734375, "step": 1066, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.646862685662322, "grad_norm": 1.4412554502487183, "kl": 20.09375, "learning_rate": 7.912263832802671e-06, "loss": 0.0201, "reward": 0.002496986708138138, "reward_std": 0.08300111442804337, "rewards/ndcg_rule_reward": -0.017034263350069523, "rewards/rule_reward": 0.01953125, "step": 1067, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6474689299787815, "grad_norm": 2.0858664512634277, "kl": 27.5, "learning_rate": 7.908273537864114e-06, "loss": 0.0276, "reward": 0.0030476609244942665, "reward_std": 0.1164201907813549, "rewards/ndcg_rule_reward": -0.024296089075505733, "rewards/rule_reward": 0.02734375, "step": 1068, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.648075174295241, "grad_norm": 1.8773126602172852, "kl": 40.875, "learning_rate": 7.904280441599802e-06, "loss": 0.0409, "reward": 0.0038686422631144524, "reward_std": 0.13285984098911285, "rewards/ndcg_rule_reward": -0.027381357736885548, "rewards/rule_reward": 0.03125, "step": 1069, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.6486814186117005, "grad_norm": 2.415571928024292, "kl": 23.8125, "learning_rate": 7.900284547855992e-06, "loss": 0.0238, "reward": 0.00369166093878448, "reward_std": 0.14134222269058228, "rewards/ndcg_rule_reward": -0.02951146475970745, "rewards/rule_reward": 0.033203125, "step": 1070, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.6492876629281601, "grad_norm": 1.2631419897079468, "kl": 31.625, "learning_rate": 7.896285860481633e-06, "loss": 0.0315, "reward": 0.0029429395217448473, "reward_std": 0.09962069615721703, "rewards/ndcg_rule_reward": -0.020494560711085796, "rewards/rule_reward": 0.0234375, "step": 1071, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6498939072446196, "grad_norm": 1.9867761135101318, "kl": 34.0625, "learning_rate": 7.892284383328367e-06, "loss": 0.0341, "reward": 0.004283652175217867, "reward_std": 0.1326388381421566, "rewards/ndcg_rule_reward": -0.026966347359120846, "rewards/rule_reward": 0.03125, "step": 1072, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6505001515610791, "grad_norm": 2.510725498199463, "kl": 33.9375, "learning_rate": 7.888280120250525e-06, "loss": 0.0339, "reward": 0.004336262121796608, "reward_std": 0.14101384580135345, "rewards/ndcg_rule_reward": -0.02886686474084854, "rewards/rule_reward": 0.033203125, "step": 1073, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6511063958775386, "grad_norm": 2.2042548656463623, "kl": 15.6875, "learning_rate": 7.884273075105115e-06, "loss": 0.0157, "reward": 0.0034708420280367136, "reward_std": 0.13301177695393562, "rewards/ndcg_rule_reward": -0.02777915820479393, "rewards/rule_reward": 0.03125, "step": 1074, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 6.201171875, "epoch": 0.6517126401939982, "grad_norm": 3.951683759689331, "kl": 43.25, "learning_rate": 7.88026325175183e-06, "loss": 0.0432, "reward": 0.0040599601925350726, "reward_std": 0.11594151705503464, "rewards/ndcg_rule_reward": -0.02328378986567259, "rewards/rule_reward": 0.02734375, "step": 1075, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6523188845104577, "grad_norm": 2.709657669067383, "kl": 42.5, "learning_rate": 7.876250654053037e-06, "loss": 0.0425, "reward": 0.0029669758514501154, "reward_std": 0.11647269129753113, "rewards/ndcg_rule_reward": -0.024376774206757545, "rewards/rule_reward": 0.02734375, "step": 1076, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6529251288269172, "grad_norm": 1.6282787322998047, "kl": 23.625, "learning_rate": 7.872235285873775e-06, "loss": 0.0236, "reward": 0.00426122872158885, "reward_std": 0.15790078788995743, "rewards/ndcg_rule_reward": -0.03284814581274986, "rewards/rule_reward": 0.037109375, "step": 1077, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6535313731433767, "grad_norm": 1.587686538696289, "kl": 15.375, "learning_rate": 7.868217151081757e-06, "loss": 0.0154, "reward": 0.0032342495396733284, "reward_std": 0.11633391678333282, "rewards/ndcg_rule_reward": -0.024109501391649246, "rewards/rule_reward": 0.02734375, "step": 1078, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6541376174598363, "grad_norm": 3.1498210430145264, "kl": 42.6875, "learning_rate": 7.86419625354735e-06, "loss": 0.0426, "reward": 0.0045736117754131556, "reward_std": 0.15780041739344597, "rewards/ndcg_rule_reward": -0.032535762526094913, "rewards/rule_reward": 0.037109375, "step": 1079, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6547438617762958, "grad_norm": 2.199197292327881, "kl": 25.8125, "learning_rate": 7.860172597143593e-06, "loss": 0.0259, "reward": 0.002922971034422517, "reward_std": 0.09963486343622208, "rewards/ndcg_rule_reward": -0.02051452873274684, "rewards/rule_reward": 0.0234375, "step": 1080, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6553501060927553, "grad_norm": 1.8198360204696655, "kl": 19.5, "learning_rate": 7.856146185746175e-06, "loss": 0.0195, "reward": 0.0035475371405482292, "reward_std": 0.1245575025677681, "rewards/ndcg_rule_reward": -0.02574933785945177, "rewards/rule_reward": 0.029296875, "step": 1081, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.655956350409215, "grad_norm": 1.5480130910873413, "kl": 19.875, "learning_rate": 7.852117023233445e-06, "loss": 0.0199, "reward": 0.0038098612567409873, "reward_std": 0.12448315694928169, "rewards/ndcg_rule_reward": -0.025487014092504978, "rewards/rule_reward": 0.029296875, "step": 1082, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.6565625947256745, "grad_norm": 1.7058883905410767, "kl": 23.8125, "learning_rate": 7.848085113486397e-06, "loss": 0.0238, "reward": 0.0037231036694720387, "reward_std": 0.13290438801050186, "rewards/ndcg_rule_reward": -0.027526896446943283, "rewards/rule_reward": 0.03125, "step": 1083, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.657168839042134, "grad_norm": 1.620418906211853, "kl": 23.1875, "learning_rate": 7.844050460388672e-06, "loss": 0.0232, "reward": 0.0032821811037138104, "reward_std": 0.10788209736347198, "rewards/ndcg_rule_reward": -0.02210844401270151, "rewards/rule_reward": 0.025390625, "step": 1084, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6577750833585935, "grad_norm": 2.6594512462615967, "kl": 33.1875, "learning_rate": 7.840013067826553e-06, "loss": 0.0332, "reward": 0.0041475489269942045, "reward_std": 0.13270894810557365, "rewards/ndcg_rule_reward": -0.027102451771497726, "rewards/rule_reward": 0.03125, "step": 1085, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 8.84375, "epoch": 0.6583813276750531, "grad_norm": 1.672900676727295, "kl": 34.125, "learning_rate": 7.835972939688968e-06, "loss": 0.0341, "reward": 0.003164405527058989, "reward_std": 0.09112159162759781, "rewards/ndcg_rule_reward": -0.01831996999680996, "rewards/rule_reward": 0.021484375, "step": 1086, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6589875719915126, "grad_norm": 2.0236077308654785, "kl": 18.3125, "learning_rate": 7.83193007986747e-06, "loss": 0.0183, "reward": 0.0022016222355887294, "reward_std": 0.09156234189867973, "rewards/ndcg_rule_reward": -0.019282753113657236, "rewards/rule_reward": 0.021484375, "step": 1087, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6595938163079721, "grad_norm": 1.977866530418396, "kl": 41.75, "learning_rate": 7.82788449225625e-06, "loss": 0.0417, "reward": 0.00393056811299175, "reward_std": 0.12441453337669373, "rewards/ndcg_rule_reward": -0.02536630630493164, "rewards/rule_reward": 0.029296875, "step": 1088, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.6602000606244316, "grad_norm": 2.0701074600219727, "kl": 59.0, "learning_rate": 7.823836180752125e-06, "loss": 0.0591, "reward": 0.004847432486712933, "reward_std": 0.15761573612689972, "rewards/ndcg_rule_reward": -0.03226194251328707, "rewards/rule_reward": 0.037109375, "step": 1089, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 7.373046875, "epoch": 0.6608063049408912, "grad_norm": 1.4338467121124268, "kl": 36.75, "learning_rate": 7.819785149254534e-06, "loss": 0.0368, "reward": 0.004463844001293182, "reward_std": 0.13257621973752975, "rewards/ndcg_rule_reward": -0.026786155998706818, "rewards/rule_reward": 0.03125, "step": 1090, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 6.681640625, "epoch": 0.6614125492573507, "grad_norm": 2.1623899936676025, "kl": 45.0, "learning_rate": 7.815731401665538e-06, "loss": 0.045, "reward": 0.0037011801032349467, "reward_std": 0.13291162997484207, "rewards/ndcg_rule_reward": -0.02754881978034973, "rewards/rule_reward": 0.03125, "step": 1091, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.6620187935738102, "grad_norm": 2.035214424133301, "kl": 41.25, "learning_rate": 7.811674941889815e-06, "loss": 0.0412, "reward": 0.004993153968825936, "reward_std": 0.16597019135951996, "rewards/ndcg_rule_reward": -0.03406934812664986, "rewards/rule_reward": 0.0390625, "step": 1092, "token_diversity": 0.31456249999999997 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6626250378902698, "grad_norm": 1.8758511543273926, "kl": 30.5625, "learning_rate": 7.807615773834653e-06, "loss": 0.0306, "reward": 0.003906654193997383, "reward_std": 0.12444719299674034, "rewards/ndcg_rule_reward": -0.02539022173732519, "rewards/rule_reward": 0.029296875, "step": 1093, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6632312822067293, "grad_norm": 3.026345729827881, "kl": 49.5, "learning_rate": 7.803553901409948e-06, "loss": 0.0497, "reward": 0.0036612889962270856, "reward_std": 0.13295524567365646, "rewards/ndcg_rule_reward": -0.02758871205151081, "rewards/rule_reward": 0.03125, "step": 1094, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.6638375265231888, "grad_norm": 1.459172010421753, "kl": 29.5625, "learning_rate": 7.799489328528205e-06, "loss": 0.0296, "reward": 0.002690065070055425, "reward_std": 0.08290820568799973, "rewards/ndcg_rule_reward": -0.016841184347867966, "rewards/rule_reward": 0.01953125, "step": 1095, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6644437708396483, "grad_norm": 3.2180116176605225, "kl": 38.0, "learning_rate": 7.795422059104528e-06, "loss": 0.0379, "reward": 0.0033082523150369525, "reward_std": 0.11627284809947014, "rewards/ndcg_rule_reward": -0.024035497568547726, "rewards/rule_reward": 0.02734375, "step": 1096, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.665050015156108, "grad_norm": 1.6341252326965332, "kl": 26.6875, "learning_rate": 7.791352097056616e-06, "loss": 0.0267, "reward": 0.003283147932961583, "reward_std": 0.09945900738239288, "rewards/ndcg_rule_reward": -0.02015435229986906, "rewards/rule_reward": 0.0234375, "step": 1097, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.6656562594725675, "grad_norm": 1.665734052658081, "kl": 15.15625, "learning_rate": 7.787279446304765e-06, "loss": 0.0152, "reward": 0.002796939807012677, "reward_std": 0.09970229119062424, "rewards/ndcg_rule_reward": -0.020640560425817966, "rewards/rule_reward": 0.0234375, "step": 1098, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.666262503789027, "grad_norm": 2.8254342079162598, "kl": 33.1875, "learning_rate": 7.78320411077186e-06, "loss": 0.0331, "reward": 0.0035307303769513965, "reward_std": 0.11616990715265274, "rewards/ndcg_rule_reward": -0.02381301950663328, "rewards/rule_reward": 0.02734375, "step": 1099, "token_diversity": 0.3125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6668687481054865, "grad_norm": 1.6800376176834106, "kl": 13.796875, "learning_rate": 7.779126094383371e-06, "loss": 0.0138, "reward": 0.0034630249720066786, "reward_std": 0.13302037864923477, "rewards/ndcg_rule_reward": -0.027786974795162678, "rewards/rule_reward": 0.03125, "step": 1100, "token_diversity": 0.5390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6674749924219461, "grad_norm": 2.5006918907165527, "kl": 6.1875, "learning_rate": 7.77504540106735e-06, "loss": 0.0062, "reward": 0.002905164088588208, "reward_std": 0.09966735541820526, "rewards/ndcg_rule_reward": -0.020532336551696062, "rewards/rule_reward": 0.0234375, "step": 1101, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6680812367384056, "grad_norm": 1.3831433057785034, "kl": 36.0, "learning_rate": 7.77096203475443e-06, "loss": 0.0359, "reward": 0.003828902030363679, "reward_std": 0.11601465940475464, "rewards/ndcg_rule_reward": -0.023514848202466965, "rewards/rule_reward": 0.02734375, "step": 1102, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6686874810548651, "grad_norm": 3.674030303955078, "kl": 44.3125, "learning_rate": 7.766875999377816e-06, "loss": 0.0443, "reward": 0.0024830379988998175, "reward_std": 0.0745764710009098, "rewards/ndcg_rule_reward": -0.0150950881652534, "rewards/rule_reward": 0.017578125, "step": 1103, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.6692937253713247, "grad_norm": 2.3015496730804443, "kl": 16.71875, "learning_rate": 7.762787298873287e-06, "loss": 0.0167, "reward": 0.003164658206515014, "reward_std": 0.0910908654332161, "rewards/ndcg_rule_reward": -0.018319716677069664, "rewards/rule_reward": 0.021484375, "step": 1104, "token_diversity": 0.3879654255319149 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.6698999696877842, "grad_norm": 3.039923667907715, "kl": 34.0625, "learning_rate": 7.758695937179185e-06, "loss": 0.0341, "reward": 0.00343125790823251, "reward_std": 0.11620161682367325, "rewards/ndcg_rule_reward": -0.023912492208182812, "rewards/rule_reward": 0.02734375, "step": 1105, "token_diversity": 0.29321961009174313 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6705062140042437, "grad_norm": 3.0898008346557617, "kl": 33.3125, "learning_rate": 7.754601918236417e-06, "loss": 0.0333, "reward": 0.0038818970788270235, "reward_std": 0.1412883773446083, "rewards/ndcg_rule_reward": -0.02932122815400362, "rewards/rule_reward": 0.033203125, "step": 1106, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6711124583207032, "grad_norm": 1.1718164682388306, "kl": 32.1875, "learning_rate": 7.750505245988453e-06, "loss": 0.0321, "reward": 0.004082134808413684, "reward_std": 0.1159343495965004, "rewards/ndcg_rule_reward": -0.023261615075170994, "rewards/rule_reward": 0.02734375, "step": 1107, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6717187026371628, "grad_norm": 1.2025525569915771, "kl": 15.46875, "learning_rate": 7.746405924381312e-06, "loss": 0.0155, "reward": 0.0023160403361544013, "reward_std": 0.08305689692497253, "rewards/ndcg_rule_reward": -0.017215209547430277, "rewards/rule_reward": 0.01953125, "step": 1108, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6723249469536223, "grad_norm": 2.3791518211364746, "kl": 20.25, "learning_rate": 7.742303957363571e-06, "loss": 0.0202, "reward": 0.002853238023817539, "reward_std": 0.0912567526102066, "rewards/ndcg_rule_reward": -0.01863113697618246, "rewards/rule_reward": 0.021484375, "step": 1109, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6729311912700818, "grad_norm": 1.5200691223144531, "kl": 24.625, "learning_rate": 7.738199348886353e-06, "loss": 0.0246, "reward": 0.002524918469134718, "reward_std": 0.0745600052177906, "rewards/ndcg_rule_reward": -0.01505320705473423, "rewards/rule_reward": 0.017578125, "step": 1110, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6735374355865413, "grad_norm": 2.3678066730499268, "kl": 23.875, "learning_rate": 7.734092102903323e-06, "loss": 0.0239, "reward": 0.002203091571573168, "reward_std": 0.0831257812678814, "rewards/ndcg_rule_reward": -0.01732815895229578, "rewards/rule_reward": 0.01953125, "step": 1111, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.674143679903001, "grad_norm": 12.815342903137207, "kl": 72.875, "learning_rate": 7.72998222337069e-06, "loss": 0.0729, "reward": 0.0032606960739940405, "reward_std": 0.09948847070336342, "rewards/ndcg_rule_reward": -0.02017680462449789, "rewards/rule_reward": 0.0234375, "step": 1112, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 6.921875, "epoch": 0.6747499242194605, "grad_norm": 11.059673309326172, "kl": 40.4375, "learning_rate": 7.7258697142472e-06, "loss": 0.0404, "reward": 0.002754772547632456, "reward_std": 0.0828559547662735, "rewards/ndcg_rule_reward": -0.016776478849351406, "rewards/rule_reward": 0.01953125, "step": 1113, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.67535616853592, "grad_norm": 3.8641836643218994, "kl": 32.78125, "learning_rate": 7.721754579494127e-06, "loss": 0.0328, "reward": 0.0023006758419796824, "reward_std": 0.07469811290502548, "rewards/ndcg_rule_reward": -0.015277449507266283, "rewards/rule_reward": 0.017578125, "step": 1114, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 8.36328125, "epoch": 0.6759624128523795, "grad_norm": 3.338991165161133, "kl": 28.875, "learning_rate": 7.71763682307528e-06, "loss": 0.0288, "reward": 0.0035656108520925045, "reward_std": 0.11612476408481598, "rewards/ndcg_rule_reward": -0.023778139613568783, "rewards/rule_reward": 0.02734375, "step": 1115, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 6.681640625, "epoch": 0.6765686571688391, "grad_norm": 1.6243512630462646, "kl": 16.0625, "learning_rate": 7.71351644895699e-06, "loss": 0.0161, "reward": 0.0023042691173031926, "reward_std": 0.08309014141559601, "rewards/ndcg_rule_reward": -0.01722698099911213, "rewards/rule_reward": 0.01953125, "step": 1116, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6771749014852986, "grad_norm": 2.134286880493164, "kl": 32.8125, "learning_rate": 7.709393461108108e-06, "loss": 0.0328, "reward": 0.004314129240810871, "reward_std": 0.12422991544008255, "rewards/ndcg_rule_reward": -0.02498274575918913, "rewards/rule_reward": 0.029296875, "step": 1117, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6777811458017581, "grad_norm": 2.453505277633667, "kl": 18.265625, "learning_rate": 7.705267863500005e-06, "loss": 0.0183, "reward": 0.0030229766271077096, "reward_std": 0.09957040101289749, "rewards/ndcg_rule_reward": -0.020414522849023342, "rewards/rule_reward": 0.0234375, "step": 1118, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6783873901182177, "grad_norm": 1.3927743434906006, "kl": 23.21875, "learning_rate": 7.701139660106569e-06, "loss": 0.0232, "reward": 0.003751521580852568, "reward_std": 0.11607258021831512, "rewards/ndcg_rule_reward": -0.02359222900122404, "rewards/rule_reward": 0.02734375, "step": 1119, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6789936344346772, "grad_norm": 1.552122950553894, "kl": 35.0625, "learning_rate": 7.69700885490419e-06, "loss": 0.0351, "reward": 0.004179840791039169, "reward_std": 0.13270125538110733, "rewards/ndcg_rule_reward": -0.02707015909254551, "rewards/rule_reward": 0.03125, "step": 1120, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6795998787511367, "grad_norm": 2.0493924617767334, "kl": 41.6875, "learning_rate": 7.692875451871771e-06, "loss": 0.0417, "reward": 0.004231736878864467, "reward_std": 0.14111417904496193, "rewards/ndcg_rule_reward": -0.028971388936042786, "rewards/rule_reward": 0.033203125, "step": 1121, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 7.40234375, "epoch": 0.6802061230675962, "grad_norm": 1.4302327632904053, "kl": 44.9375, "learning_rate": 7.688739454990713e-06, "loss": 0.0448, "reward": 0.003794636228121817, "reward_std": 0.12445210665464401, "rewards/ndcg_rule_reward": -0.025502239353954792, "rewards/rule_reward": 0.029296875, "step": 1122, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6808123673840558, "grad_norm": 2.245603561401367, "kl": 32.25, "learning_rate": 7.68460086824492e-06, "loss": 0.0322, "reward": 0.0029597796383313835, "reward_std": 0.09958646446466446, "rewards/ndcg_rule_reward": -0.020477721001952887, "rewards/rule_reward": 0.0234375, "step": 1123, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.6814186117005153, "grad_norm": 2.411897897720337, "kl": 51.25, "learning_rate": 7.680459695620785e-06, "loss": 0.0513, "reward": 0.004301757202483714, "reward_std": 0.12423455715179443, "rewards/ndcg_rule_reward": -0.02499511744827032, "rewards/rule_reward": 0.029296875, "step": 1124, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6820248560169748, "grad_norm": 3.3324437141418457, "kl": 46.875, "learning_rate": 7.676315941107195e-06, "loss": 0.0469, "reward": 0.0026141441194340587, "reward_std": 0.09136122465133667, "rewards/ndcg_rule_reward": -0.018870231695473194, "rewards/rule_reward": 0.021484375, "step": 1125, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 6.201171875, "epoch": 0.6826311003334343, "grad_norm": 3.9220287799835205, "kl": 65.3125, "learning_rate": 7.672169608695526e-06, "loss": 0.0654, "reward": 0.003292763140052557, "reward_std": 0.10784987360239029, "rewards/ndcg_rule_reward": -0.022097861394286156, "rewards/rule_reward": 0.025390625, "step": 1126, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6832373446498939, "grad_norm": 1.292702555656433, "kl": 34.625, "learning_rate": 7.668020702379633e-06, "loss": 0.0347, "reward": 0.0036404170095920563, "reward_std": 0.10770619660615921, "rewards/ndcg_rule_reward": -0.02175020845606923, "rewards/rule_reward": 0.025390625, "step": 1127, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6838435889663534, "grad_norm": 1.907912254333496, "kl": 25.640625, "learning_rate": 7.663869226155852e-06, "loss": 0.0256, "reward": 0.0038171778433024883, "reward_std": 0.12448140978813171, "rewards/ndcg_rule_reward": -0.0254796976223588, "rewards/rule_reward": 0.029296875, "step": 1128, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.684449833282813, "grad_norm": 2.036144495010376, "kl": 33.9375, "learning_rate": 7.659715184022993e-06, "loss": 0.0339, "reward": 0.002107715990860015, "reward_std": 0.06632684171199799, "rewards/ndcg_rule_reward": -0.013517285231500864, "rewards/rule_reward": 0.015625, "step": 1129, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 8.123046875, "epoch": 0.6850560775992726, "grad_norm": 1.9230588674545288, "kl": 31.5625, "learning_rate": 7.655558579982341e-06, "loss": 0.0316, "reward": 0.004924377892166376, "reward_std": 0.14076969772577286, "rewards/ndcg_rule_reward": -0.02827874943614006, "rewards/rule_reward": 0.033203125, "step": 1130, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6856623219157321, "grad_norm": 2.017653465270996, "kl": 21.0, "learning_rate": 7.651399418037646e-06, "loss": 0.021, "reward": 0.0026891540037468076, "reward_std": 0.09973633289337158, "rewards/ndcg_rule_reward": -0.020748346112668514, "rewards/rule_reward": 0.0234375, "step": 1131, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6862685662321916, "grad_norm": 1.9229034185409546, "kl": 17.25, "learning_rate": 7.647237702195123e-06, "loss": 0.0173, "reward": 0.002284613496158272, "reward_std": 0.09992656856775284, "rewards/ndcg_rule_reward": -0.021152885630726814, "rewards/rule_reward": 0.0234375, "step": 1132, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6868748105486511, "grad_norm": 1.3471146821975708, "kl": 17.25, "learning_rate": 7.643073436463446e-06, "loss": 0.0172, "reward": 0.0024801537510938942, "reward_std": 0.09982535988092422, "rewards/ndcg_rule_reward": -0.020957346074283123, "rewards/rule_reward": 0.0234375, "step": 1133, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6874810548651107, "grad_norm": 1.7968500852584839, "kl": 25.0625, "learning_rate": 7.638906624853744e-06, "loss": 0.025, "reward": 0.0038081654347479343, "reward_std": 0.1328355148434639, "rewards/ndcg_rule_reward": -0.027441835030913353, "rewards/rule_reward": 0.03125, "step": 1134, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6880872991815702, "grad_norm": 1.383000373840332, "kl": 27.875, "learning_rate": 7.634737271379603e-06, "loss": 0.0279, "reward": 0.004053993616253138, "reward_std": 0.1243416927754879, "rewards/ndcg_rule_reward": -0.025242880918085575, "rewards/rule_reward": 0.029296875, "step": 1135, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6886935434980297, "grad_norm": 10.390534400939941, "kl": 92.625, "learning_rate": 7.630565380057052e-06, "loss": 0.0926, "reward": 0.0040420803707093, "reward_std": 0.12434490025043488, "rewards/ndcg_rule_reward": -0.02525479346513748, "rewards/rule_reward": 0.029296875, "step": 1136, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6892997878144892, "grad_norm": 1.782772183418274, "kl": 10.59375, "learning_rate": 7.6263909549045655e-06, "loss": 0.0106, "reward": 0.0029258265858516097, "reward_std": 0.10802308097481728, "rewards/ndcg_rule_reward": -0.022464798763394356, "rewards/rule_reward": 0.025390625, "step": 1137, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6899060321309488, "grad_norm": 1.7112805843353271, "kl": 26.3125, "learning_rate": 7.622213999943062e-06, "loss": 0.0263, "reward": 0.003124944167211652, "reward_std": 0.1163720190525055, "rewards/ndcg_rule_reward": -0.024218806065618992, "rewards/rule_reward": 0.02734375, "step": 1138, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6905122764474083, "grad_norm": 1.5888053178787231, "kl": 33.1875, "learning_rate": 7.618034519195896e-06, "loss": 0.0331, "reward": 0.00389813759829849, "reward_std": 0.11599492281675339, "rewards/ndcg_rule_reward": -0.023445612750947475, "rewards/rule_reward": 0.02734375, "step": 1139, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6911185207638678, "grad_norm": 1.5835448503494263, "kl": 36.875, "learning_rate": 7.6138525166888514e-06, "loss": 0.0368, "reward": 0.0037925676442682743, "reward_std": 0.12445781752467155, "rewards/ndcg_rule_reward": -0.025504305958747864, "rewards/rule_reward": 0.029296875, "step": 1140, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6917247650803273, "grad_norm": 1.8795007467269897, "kl": 30.15625, "learning_rate": 7.609667996450141e-06, "loss": 0.0302, "reward": 0.002808305434882641, "reward_std": 0.09966146573424339, "rewards/ndcg_rule_reward": -0.020629193633794785, "rewards/rule_reward": 0.0234375, "step": 1141, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6923310093967869, "grad_norm": 1.84165358543396, "kl": 47.3125, "learning_rate": 7.60548096251041e-06, "loss": 0.0475, "reward": 0.004272248945198953, "reward_std": 0.12424638867378235, "rewards/ndcg_rule_reward": -0.025024627335369587, "rewards/rule_reward": 0.029296875, "step": 1142, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6929372537132464, "grad_norm": 1.9447077512741089, "kl": 20.875, "learning_rate": 7.6012914189027164e-06, "loss": 0.0209, "reward": 0.0031997470650821924, "reward_std": 0.11632934957742691, "rewards/ndcg_rule_reward": -0.02414400316774845, "rewards/rule_reward": 0.02734375, "step": 1143, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6935434980297059, "grad_norm": 1.8012787103652954, "kl": 31.5, "learning_rate": 7.597099369662539e-06, "loss": 0.0315, "reward": 0.004528591642156243, "reward_std": 0.14935649931430817, "rewards/ndcg_rule_reward": -0.03062765672802925, "rewards/rule_reward": 0.03515625, "step": 1144, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.033203125, "epoch": 0.6941497423461656, "grad_norm": 6.505764484405518, "kl": 55.3125, "learning_rate": 7.592904818827774e-06, "loss": 0.0552, "reward": 0.0033990800147876143, "reward_std": 0.09938598796725273, "rewards/ndcg_rule_reward": -0.020038421265780926, "rewards/rule_reward": 0.0234375, "step": 1145, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6947559866626251, "grad_norm": 1.7612240314483643, "kl": 33.5, "learning_rate": 7.588707770438719e-06, "loss": 0.0335, "reward": 0.0038571784971281886, "reward_std": 0.1328638792037964, "rewards/ndcg_rule_reward": -0.02739282138645649, "rewards/rule_reward": 0.03125, "step": 1146, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6953622309790846, "grad_norm": 2.9386487007141113, "kl": 47.0, "learning_rate": 7.584508228538086e-06, "loss": 0.047, "reward": 0.004025442525744438, "reward_std": 0.13277436047792435, "rewards/ndcg_rule_reward": -0.027224557474255562, "rewards/rule_reward": 0.03125, "step": 1147, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6959684752955441, "grad_norm": 2.677557945251465, "kl": 51.375, "learning_rate": 7.580306197170979e-06, "loss": 0.0514, "reward": 0.005061861127614975, "reward_std": 0.15758126974105835, "rewards/ndcg_rule_reward": -0.0320475148037076, "rewards/rule_reward": 0.037109375, "step": 1148, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 6.681640625, "epoch": 0.6965747196120037, "grad_norm": 1.8094522953033447, "kl": 61.25, "learning_rate": 7.576101680384907e-06, "loss": 0.0613, "reward": 0.004132717614993453, "reward_std": 0.11588554084300995, "rewards/ndcg_rule_reward": -0.023211033549159765, "rewards/rule_reward": 0.02734375, "step": 1149, "token_diversity": 0.33396084337349397 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6971809639284632, "grad_norm": 1.7074253559112549, "kl": 37.375, "learning_rate": 7.571894682229776e-06, "loss": 0.0374, "reward": 0.0035382749047130346, "reward_std": 0.09934337437152863, "rewards/ndcg_rule_reward": -0.01989922486245632, "rewards/rule_reward": 0.0234375, "step": 1150, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6977872082449227, "grad_norm": 4.045222282409668, "kl": 14.46875, "learning_rate": 7.567685206757873e-06, "loss": 0.0144, "reward": 0.0022159484797157347, "reward_std": 0.08315413445234299, "rewards/ndcg_rule_reward": -0.017315302044153214, "rewards/rule_reward": 0.01953125, "step": 1151, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6983934525613822, "grad_norm": 1.3753670454025269, "kl": 25.5, "learning_rate": 7.563473258023877e-06, "loss": 0.0256, "reward": 0.003192955977283418, "reward_std": 0.09951380267739296, "rewards/ndcg_rule_reward": -0.020244544371962547, "rewards/rule_reward": 0.0234375, "step": 1152, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6989996968778418, "grad_norm": 2.3530447483062744, "kl": 23.75, "learning_rate": 7.559258840084848e-06, "loss": 0.0238, "reward": 0.0020015458576381207, "reward_std": 0.06639154255390167, "rewards/ndcg_rule_reward": -0.01362345414236188, "rewards/rule_reward": 0.015625, "step": 1153, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.6996059411943013, "grad_norm": 1.7975577116012573, "kl": 20.25, "learning_rate": 7.555041957000223e-06, "loss": 0.0203, "reward": 0.0032243456225842237, "reward_std": 0.09947482869029045, "rewards/ndcg_rule_reward": -0.020213154144585133, "rewards/rule_reward": 0.0234375, "step": 1154, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7002121855107608, "grad_norm": 1.777156949043274, "kl": 46.25, "learning_rate": 7.550822612831819e-06, "loss": 0.0462, "reward": 0.0037526197265833616, "reward_std": 0.10764295980334282, "rewards/ndcg_rule_reward": -0.021638005506247282, "rewards/rule_reward": 0.025390625, "step": 1155, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7008184298272204, "grad_norm": 3.51204776763916, "kl": 33.25, "learning_rate": 7.546600811643816e-06, "loss": 0.0333, "reward": 0.003173461416736245, "reward_std": 0.10792272537946701, "rewards/ndcg_rule_reward": -0.0222171638160944, "rewards/rule_reward": 0.025390625, "step": 1156, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7014246741436799, "grad_norm": 2.804018974304199, "kl": 52.375, "learning_rate": 7.542376557502764e-06, "loss": 0.0524, "reward": 0.004819802241399884, "reward_std": 0.14080116897821426, "rewards/ndcg_rule_reward": -0.02838332299143076, "rewards/rule_reward": 0.033203125, "step": 1157, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7020309184601394, "grad_norm": 1.5506559610366821, "kl": 26.6875, "learning_rate": 7.538149854477579e-06, "loss": 0.0267, "reward": 0.0031527040991932154, "reward_std": 0.09110091254115105, "rewards/ndcg_rule_reward": -0.01833167113363743, "rewards/rule_reward": 0.021484375, "step": 1158, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.7026371627765989, "grad_norm": 2.4314422607421875, "kl": 39.0, "learning_rate": 7.533920706639531e-06, "loss": 0.039, "reward": 0.00376433995552361, "reward_std": 0.12445865944027901, "rewards/ndcg_rule_reward": -0.025532535277307034, "rewards/rule_reward": 0.029296875, "step": 1159, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7032434070930585, "grad_norm": 1.2489441633224487, "kl": 24.5, "learning_rate": 7.5296891180622466e-06, "loss": 0.0245, "reward": 0.0029625900788232684, "reward_std": 0.08278887532651424, "rewards/ndcg_rule_reward": -0.016568659571930766, "rewards/rule_reward": 0.01953125, "step": 1160, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.703849651409518, "grad_norm": 4.84531831741333, "kl": 31.25, "learning_rate": 7.525455092821703e-06, "loss": 0.0312, "reward": 0.0031225252896547318, "reward_std": 0.09957098960876465, "rewards/ndcg_rule_reward": -0.020314975641667843, "rewards/rule_reward": 0.0234375, "step": 1161, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7044558957259776, "grad_norm": 1.6549159288406372, "kl": 26.1875, "learning_rate": 7.521218634996226e-06, "loss": 0.0262, "reward": 0.0036726780235767365, "reward_std": 0.11609014868736267, "rewards/ndcg_rule_reward": -0.023671071976423264, "rewards/rule_reward": 0.02734375, "step": 1162, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7050621400424371, "grad_norm": 3.9493062496185303, "kl": 71.125, "learning_rate": 7.5169797486664845e-06, "loss": 0.0712, "reward": 0.002991860150359571, "reward_std": 0.11643088981509209, "rewards/ndcg_rule_reward": -0.02435188926756382, "rewards/rule_reward": 0.02734375, "step": 1163, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.7056683843588967, "grad_norm": 2.257228136062622, "kl": 32.40625, "learning_rate": 7.512738437915482e-06, "loss": 0.0324, "reward": 0.0017833317979238927, "reward_std": 0.08331675827503204, "rewards/ndcg_rule_reward": -0.017747918143868446, "rewards/rule_reward": 0.01953125, "step": 1164, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7062746286753562, "grad_norm": 1.874545693397522, "kl": 23.5, "learning_rate": 7.508494706828564e-06, "loss": 0.0235, "reward": 0.0018868478946387768, "reward_std": 0.06642763689160347, "rewards/ndcg_rule_reward": -0.01373815257102251, "rewards/rule_reward": 0.015625, "step": 1165, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7068808729918157, "grad_norm": 1.6855008602142334, "kl": 8.90625, "learning_rate": 7.504248559493403e-06, "loss": 0.0089, "reward": 0.001265684375539422, "reward_std": 0.04989112168550491, "rewards/ndcg_rule_reward": -0.010453065857291222, "rewards/rule_reward": 0.01171875, "step": 1166, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7074871173082752, "grad_norm": 1.574112892150879, "kl": 30.375, "learning_rate": 7.500000000000001e-06, "loss": 0.0303, "reward": 0.0034135469468310475, "reward_std": 0.09938304871320724, "rewards/ndcg_rule_reward": -0.020023953169584274, "rewards/rule_reward": 0.0234375, "step": 1167, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 6.2734375, "epoch": 0.7080933616247348, "grad_norm": 1.6939905881881714, "kl": 31.3125, "learning_rate": 7.495749032440681e-06, "loss": 0.0314, "reward": 0.003825573017820716, "reward_std": 0.11601601541042328, "rewards/ndcg_rule_reward": -0.023518177680671215, "rewards/rule_reward": 0.02734375, "step": 1168, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.7086996059411943, "grad_norm": 2.645975351333618, "kl": 19.96875, "learning_rate": 7.491495660910088e-06, "loss": 0.02, "reward": 0.003251841408200562, "reward_std": 0.1163281686604023, "rewards/ndcg_rule_reward": -0.024091909639537334, "rewards/rule_reward": 0.02734375, "step": 1169, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7093058502576538, "grad_norm": 3.406238079071045, "kl": 51.625, "learning_rate": 7.487239889505181e-06, "loss": 0.0517, "reward": 0.0055340786930173635, "reward_std": 0.14887920394539833, "rewards/ndcg_rule_reward": -0.029622171074151993, "rewards/rule_reward": 0.03515625, "step": 1170, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7099120945741134, "grad_norm": 1.0385234355926514, "kl": 21.65625, "learning_rate": 7.4829817223252324e-06, "loss": 0.0217, "reward": 0.002473140135407448, "reward_std": 0.07458123750984669, "rewards/ndcg_rule_reward": -0.015104984631761909, "rewards/rule_reward": 0.017578125, "step": 1171, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7105183388905729, "grad_norm": 1.9737292528152466, "kl": 29.53125, "learning_rate": 7.47872116347182e-06, "loss": 0.0295, "reward": 0.003995839040726423, "reward_std": 0.13278842344880104, "rewards/ndcg_rule_reward": -0.027254161424934864, "rewards/rule_reward": 0.03125, "step": 1172, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7111245832070324, "grad_norm": 2.2054402828216553, "kl": 23.75, "learning_rate": 7.474458217048827e-06, "loss": 0.0238, "reward": 0.002729181433096528, "reward_std": 0.09972382709383965, "rewards/ndcg_rule_reward": -0.020708318799734116, "rewards/rule_reward": 0.0234375, "step": 1173, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7117308275234919, "grad_norm": 1.8011808395385742, "kl": 29.25, "learning_rate": 7.470192887162435e-06, "loss": 0.0293, "reward": 0.003522729384712875, "reward_std": 0.11620540171861649, "rewards/ndcg_rule_reward": -0.023821020498871803, "rewards/rule_reward": 0.02734375, "step": 1174, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.7123370718399515, "grad_norm": 1.8244696855545044, "kl": 13.8125, "learning_rate": 7.465925177921124e-06, "loss": 0.0138, "reward": 0.0022756357793696225, "reward_std": 0.0831051617860794, "rewards/ndcg_rule_reward": -0.017255613580346107, "rewards/rule_reward": 0.01953125, "step": 1175, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 8.41796875, "epoch": 0.712943316156411, "grad_norm": 2.9115684032440186, "kl": 23.125, "learning_rate": 7.461655093435661e-06, "loss": 0.0231, "reward": 0.00313850783277303, "reward_std": 0.10795547813177109, "rewards/ndcg_rule_reward": -0.022252117283642292, "rewards/rule_reward": 0.025390625, "step": 1176, "token_diversity": 0.34796779141104295 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7135495604728705, "grad_norm": 3.042701005935669, "kl": 38.125, "learning_rate": 7.4573826378191075e-06, "loss": 0.0381, "reward": 0.003220940358005464, "reward_std": 0.1163283959031105, "rewards/ndcg_rule_reward": -0.0241228099912405, "rewards/rule_reward": 0.02734375, "step": 1177, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.71415580478933, "grad_norm": 2.412492513656616, "kl": 30.1875, "learning_rate": 7.453107815186803e-06, "loss": 0.0301, "reward": 0.002531065372750163, "reward_std": 0.09978542476892471, "rewards/ndcg_rule_reward": -0.020906435325741768, "rewards/rule_reward": 0.0234375, "step": 1178, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7147620491057897, "grad_norm": 2.3192718029022217, "kl": 32.0, "learning_rate": 7.448830629656371e-06, "loss": 0.032, "reward": 0.004799584159627557, "reward_std": 0.1576642170548439, "rewards/ndcg_rule_reward": -0.03230979107320309, "rewards/rule_reward": 0.037109375, "step": 1179, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7153682934222492, "grad_norm": 1.8696331977844238, "kl": 26.3125, "learning_rate": 7.444551085347708e-06, "loss": 0.0262, "reward": 0.0025602540699765086, "reward_std": 0.09980897605419159, "rewards/ndcg_rule_reward": -0.020877246744930744, "rewards/rule_reward": 0.0234375, "step": 1180, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7159745377387087, "grad_norm": 2.1143014430999756, "kl": 24.20703125, "learning_rate": 7.440269186382982e-06, "loss": 0.0242, "reward": 0.002838522312231362, "reward_std": 0.09126309491693974, "rewards/ndcg_rule_reward": -0.01864585280418396, "rewards/rule_reward": 0.021484375, "step": 1181, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.7165807820551683, "grad_norm": 1.312723994255066, "kl": 39.625, "learning_rate": 7.435984936886637e-06, "loss": 0.0397, "reward": 0.004520059330388904, "reward_std": 0.12413249537348747, "rewards/ndcg_rule_reward": -0.02477681590244174, "rewards/rule_reward": 0.029296875, "step": 1182, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.7171870263716278, "grad_norm": 1.4259963035583496, "kl": 18.40625, "learning_rate": 7.431698340985369e-06, "loss": 0.0184, "reward": 0.0024301065132021904, "reward_std": 0.09148604795336723, "rewards/ndcg_rule_reward": -0.01905426848679781, "rewards/rule_reward": 0.021484375, "step": 1183, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7177932706880873, "grad_norm": 3.3277440071105957, "kl": 33.4375, "learning_rate": 7.427409402808144e-06, "loss": 0.0335, "reward": 0.0032542720437049866, "reward_std": 0.0994684025645256, "rewards/ndcg_rule_reward": -0.020183227956295013, "rewards/rule_reward": 0.0234375, "step": 1184, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.7183995150045468, "grad_norm": 1.5282647609710693, "kl": 24.6875, "learning_rate": 7.423118126486178e-06, "loss": 0.0247, "reward": 0.002245562616735697, "reward_std": 0.07468057796359062, "rewards/ndcg_rule_reward": -0.015332562383264303, "rewards/rule_reward": 0.017578125, "step": 1185, "token_diversity": 0.546875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7190057593210064, "grad_norm": 1.6849454641342163, "kl": 19.875, "learning_rate": 7.418824516152942e-06, "loss": 0.0199, "reward": 0.0026289138477295637, "reward_std": 0.09135998040437698, "rewards/ndcg_rule_reward": -0.018855460919439793, "rewards/rule_reward": 0.021484375, "step": 1186, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7196120036374659, "grad_norm": 1.8581608533859253, "kl": 32.6875, "learning_rate": 7.414528575944159e-06, "loss": 0.0326, "reward": 0.002917711390182376, "reward_std": 0.10803350806236267, "rewards/ndcg_rule_reward": -0.022472914308309555, "rewards/rule_reward": 0.025390625, "step": 1187, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.7202182479539254, "grad_norm": 2.8685572147369385, "kl": 20.3125, "learning_rate": 7.410230309997785e-06, "loss": 0.0203, "reward": 0.0029947427101433277, "reward_std": 0.116402268409729, "rewards/ndcg_rule_reward": -0.02434900775551796, "rewards/rule_reward": 0.02734375, "step": 1188, "token_diversity": 0.33443509615384615 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7208244922703849, "grad_norm": 6.332581043243408, "kl": 34.0625, "learning_rate": 7.405929722454026e-06, "loss": 0.0341, "reward": 0.0037144337547942996, "reward_std": 0.14133408665657043, "rewards/ndcg_rule_reward": -0.02948869112879038, "rewards/rule_reward": 0.033203125, "step": 1189, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7214307365868445, "grad_norm": 79.09623718261719, "kl": 392.0, "learning_rate": 7.4016268174553215e-06, "loss": 0.3915, "reward": 0.0034437450231052935, "reward_std": 0.10780192166566849, "rewards/ndcg_rule_reward": -0.021946880035102367, "rewards/rule_reward": 0.025390625, "step": 1190, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.722036980903304, "grad_norm": 2.0435965061187744, "kl": 53.75, "learning_rate": 7.397321599146343e-06, "loss": 0.0537, "reward": 0.005137624451890588, "reward_std": 0.1406705304980278, "rewards/ndcg_rule_reward": -0.02806549984961748, "rewards/rule_reward": 0.033203125, "step": 1191, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7226432252197635, "grad_norm": 2.682502508163452, "kl": 38.875, "learning_rate": 7.393014071673992e-06, "loss": 0.039, "reward": 0.004356009885668755, "reward_std": 0.13264638930559158, "rewards/ndcg_rule_reward": -0.02689399104565382, "rewards/rule_reward": 0.03125, "step": 1192, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.723249469536223, "grad_norm": 3.730746269226074, "kl": 12.3515625, "learning_rate": 7.38870423918739e-06, "loss": 0.0124, "reward": 0.0021701649529859424, "reward_std": 0.09156742319464684, "rewards/ndcg_rule_reward": -0.019314209930598736, "rewards/rule_reward": 0.021484375, "step": 1193, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7238557138526827, "grad_norm": 1.4838711023330688, "kl": 18.515625, "learning_rate": 7.384392105837881e-06, "loss": 0.0185, "reward": 0.002554844773840159, "reward_std": 0.07455357909202576, "rewards/ndcg_rule_reward": -0.015023280284367502, "rewards/rule_reward": 0.017578125, "step": 1194, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 7.8828125, "epoch": 0.7244619581691422, "grad_norm": 4.330732345581055, "kl": 59.875, "learning_rate": 7.3800776757790275e-06, "loss": 0.0598, "reward": 0.005610218970105052, "reward_std": 0.15724997222423553, "rewards/ndcg_rule_reward": -0.03149915672838688, "rewards/rule_reward": 0.037109375, "step": 1195, "token_diversity": 0.29513459158415845 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7250682024856017, "grad_norm": 1.7582303285598755, "kl": 17.90625, "learning_rate": 7.375760953166601e-06, "loss": 0.0179, "reward": 0.0031729331240057945, "reward_std": 0.10794638842344284, "rewards/ndcg_rule_reward": -0.02221769280731678, "rewards/rule_reward": 0.025390625, "step": 1196, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7256744468020613, "grad_norm": 1.2791588306427002, "kl": 28.125, "learning_rate": 7.371441942158583e-06, "loss": 0.0281, "reward": 0.0029341968474909663, "reward_std": 0.09959162026643753, "rewards/ndcg_rule_reward": -0.020503303967416286, "rewards/rule_reward": 0.0234375, "step": 1197, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7262806911185208, "grad_norm": 4.1875200271606445, "kl": 73.75, "learning_rate": 7.3671206469151605e-06, "loss": 0.0738, "reward": 0.0037906577344983816, "reward_std": 0.13288097456097603, "rewards/ndcg_rule_reward": -0.027459342032670975, "rewards/rule_reward": 0.03125, "step": 1198, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7268869354349803, "grad_norm": 2.0509774684906006, "kl": 49.125, "learning_rate": 7.362797071598715e-06, "loss": 0.0491, "reward": 0.004814371233806014, "reward_std": 0.1408020779490471, "rewards/ndcg_rule_reward": -0.028388754464685917, "rewards/rule_reward": 0.033203125, "step": 1199, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7274931797514398, "grad_norm": 2.2440271377563477, "kl": 24.9375, "learning_rate": 7.358471220373831e-06, "loss": 0.0249, "reward": 0.003253636648878455, "reward_std": 0.12471484392881393, "rewards/ndcg_rule_reward": -0.026043239049613476, "rewards/rule_reward": 0.029296875, "step": 1200, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7280994240678994, "grad_norm": 1.3646348714828491, "kl": 37.25, "learning_rate": 7.354143097407283e-06, "loss": 0.0373, "reward": 0.0029207096667960286, "reward_std": 0.0828053466975689, "rewards/ndcg_rule_reward": -0.016610539983958006, "rewards/rule_reward": 0.01953125, "step": 1201, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7287056683843589, "grad_norm": 1.917989730834961, "kl": 25.75, "learning_rate": 7.349812706868031e-06, "loss": 0.0257, "reward": 0.0033531435765326023, "reward_std": 0.10783226042985916, "rewards/ndcg_rule_reward": -0.02203748095780611, "rewards/rule_reward": 0.025390625, "step": 1202, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7293119127008184, "grad_norm": 1.8602067232131958, "kl": 37.5625, "learning_rate": 7.345480052927223e-06, "loss": 0.0376, "reward": 0.0034897399600595236, "reward_std": 0.0993553176522255, "rewards/ndcg_rule_reward": -0.019947759807109833, "rewards/rule_reward": 0.0234375, "step": 1203, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7299181570172779, "grad_norm": 1.2580435276031494, "kl": 28.6875, "learning_rate": 7.341145139758185e-06, "loss": 0.0287, "reward": 0.0029952360782772303, "reward_std": 0.09960637614130974, "rewards/ndcg_rule_reward": -0.0204422646202147, "rewards/rule_reward": 0.0234375, "step": 1204, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7305244013337375, "grad_norm": 3.725395441055298, "kl": 35.875, "learning_rate": 7.33680797153642e-06, "loss": 0.0358, "reward": 0.00445230002515018, "reward_std": 0.13258032500743866, "rewards/ndcg_rule_reward": -0.026797698810696602, "rewards/rule_reward": 0.03125, "step": 1205, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 8.603515625, "epoch": 0.731130645650197, "grad_norm": 4.168271064758301, "kl": 45.625, "learning_rate": 7.332468552439603e-06, "loss": 0.0455, "reward": 0.004294002428650856, "reward_std": 0.13263314589858055, "rewards/ndcg_rule_reward": -0.026955997571349144, "rewards/rule_reward": 0.03125, "step": 1206, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7317368899666565, "grad_norm": 6.867544174194336, "kl": 54.6875, "learning_rate": 7.328126886647575e-06, "loss": 0.0546, "reward": 0.0032696144189685583, "reward_std": 0.11628378182649612, "rewards/ndcg_rule_reward": -0.024074136279523373, "rewards/rule_reward": 0.02734375, "step": 1207, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 7.6484375, "epoch": 0.7323431342831161, "grad_norm": 2.1478445529937744, "kl": 33.25, "learning_rate": 7.323782978342345e-06, "loss": 0.0332, "reward": 0.0030840509571135044, "reward_std": 0.09954767674207687, "rewards/ndcg_rule_reward": -0.020353449508547783, "rewards/rule_reward": 0.0234375, "step": 1208, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 6.921875, "epoch": 0.7329493785995757, "grad_norm": 1.524167776107788, "kl": 42.0, "learning_rate": 7.31943683170808e-06, "loss": 0.042, "reward": 0.004702065605670214, "reward_std": 0.14926859736442566, "rewards/ndcg_rule_reward": -0.0304541839286685, "rewards/rule_reward": 0.03515625, "step": 1209, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7335556229160352, "grad_norm": 1.7320271730422974, "kl": 20.875, "learning_rate": 7.3150884509311025e-06, "loss": 0.0209, "reward": 0.0033280912321060896, "reward_std": 0.11626725271344185, "rewards/ndcg_rule_reward": -0.024015658535063267, "rewards/rule_reward": 0.02734375, "step": 1210, "token_diversity": 0.546875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7341618672324947, "grad_norm": 1.8610020875930786, "kl": 35.5, "learning_rate": 7.310737840199886e-06, "loss": 0.0355, "reward": 0.004203436314128339, "reward_std": 0.13269563764333725, "rewards/ndcg_rule_reward": -0.027046565897762775, "rewards/rule_reward": 0.03125, "step": 1211, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 6.681640625, "epoch": 0.7347681115489543, "grad_norm": 1.6932421922683716, "kl": 18.4375, "learning_rate": 7.306385003705056e-06, "loss": 0.0184, "reward": 0.0025604944676160812, "reward_std": 0.09980620071291924, "rewards/ndcg_rule_reward": -0.02087700553238392, "rewards/rule_reward": 0.0234375, "step": 1212, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7353743558654138, "grad_norm": 1.7034614086151123, "kl": 31.6875, "learning_rate": 7.302029945639377e-06, "loss": 0.0316, "reward": 0.0037933874409645796, "reward_std": 0.13288352638483047, "rewards/ndcg_rule_reward": -0.027456612326204777, "rewards/rule_reward": 0.03125, "step": 1213, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7359806001818733, "grad_norm": 1.3866043090820312, "kl": 39.75, "learning_rate": 7.297672670197757e-06, "loss": 0.0398, "reward": 0.003994259401224554, "reward_std": 0.11596449092030525, "rewards/ndcg_rule_reward": -0.023349490948021412, "rewards/rule_reward": 0.02734375, "step": 1214, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7365868444983328, "grad_norm": 3.414172887802124, "kl": 48.6875, "learning_rate": 7.29331318157724e-06, "loss": 0.0488, "reward": 0.0029996454250067472, "reward_std": 0.11643492802977562, "rewards/ndcg_rule_reward": -0.024344105273485184, "rewards/rule_reward": 0.02734375, "step": 1215, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7371930888147924, "grad_norm": 1.6262938976287842, "kl": 33.875, "learning_rate": 7.288951483976998e-06, "loss": 0.0339, "reward": 0.0029332051053643227, "reward_std": 0.09119381010532379, "rewards/ndcg_rule_reward": -0.018551169894635677, "rewards/rule_reward": 0.021484375, "step": 1216, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.7377993331312519, "grad_norm": 2.0552711486816406, "kl": 45.625, "learning_rate": 7.284587581598335e-06, "loss": 0.0456, "reward": 0.0024297263007611036, "reward_std": 0.07459188997745514, "rewards/ndcg_rule_reward": -0.015148399397730827, "rewards/rule_reward": 0.017578125, "step": 1217, "token_diversity": 0.5102306547619048 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7384055774477114, "grad_norm": 1.9486573934555054, "kl": 37.0, "learning_rate": 7.280221478644675e-06, "loss": 0.0369, "reward": 0.004484647884964943, "reward_std": 0.14939861744642258, "rewards/ndcg_rule_reward": -0.030671601183712482, "rewards/rule_reward": 0.03515625, "step": 1218, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7390118217641709, "grad_norm": 2.6341779232025146, "kl": 19.375, "learning_rate": 7.275853179321565e-06, "loss": 0.0194, "reward": 0.002634102536831051, "reward_std": 0.09132120013237, "rewards/ndcg_rule_reward": -0.01885027252137661, "rewards/rule_reward": 0.021484375, "step": 1219, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7396180660806305, "grad_norm": 1.5857986211776733, "kl": 38.125, "learning_rate": 7.271482687836665e-06, "loss": 0.0382, "reward": 0.0038343241903930902, "reward_std": 0.10761530324816704, "rewards/ndcg_rule_reward": -0.021556300576776266, "rewards/rule_reward": 0.025390625, "step": 1220, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.74022431039709, "grad_norm": 2.723867177963257, "kl": 70.75, "learning_rate": 7.267110008399747e-06, "loss": 0.0707, "reward": 0.0064807378221303225, "reward_std": 0.19053547829389572, "rewards/ndcg_rule_reward": -0.038441138342022896, "rewards/rule_reward": 0.044921875, "step": 1221, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7408305547135495, "grad_norm": 1.5337096452713013, "kl": 22.6875, "learning_rate": 7.262735145222696e-06, "loss": 0.0227, "reward": 0.002429473795928061, "reward_std": 0.08305554836988449, "rewards/ndcg_rule_reward": -0.017101776786148548, "rewards/rule_reward": 0.01953125, "step": 1222, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7414367990300091, "grad_norm": 1.4886530637741089, "kl": 37.125, "learning_rate": 7.258358102519488e-06, "loss": 0.0371, "reward": 0.003109194803982973, "reward_std": 0.09111380204558372, "rewards/ndcg_rule_reward": -0.018375180661678314, "rewards/rule_reward": 0.021484375, "step": 1223, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7420430433464686, "grad_norm": 1.5978186130523682, "kl": 29.4375, "learning_rate": 7.253978884506212e-06, "loss": 0.0294, "reward": 0.003764694673009217, "reward_std": 0.13289574533700943, "rewards/ndcg_rule_reward": -0.027485307306051254, "rewards/rule_reward": 0.03125, "step": 1224, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7426492876629281, "grad_norm": 2.0064351558685303, "kl": 36.25, "learning_rate": 7.249597495401044e-06, "loss": 0.0361, "reward": 0.0037888975348323584, "reward_std": 0.13289334252476692, "rewards/ndcg_rule_reward": -0.027461102232336998, "rewards/rule_reward": 0.03125, "step": 1225, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7432555319793877, "grad_norm": 1.5921814441680908, "kl": 46.5625, "learning_rate": 7.245213939424253e-06, "loss": 0.0466, "reward": 0.004118787124752998, "reward_std": 0.12428683787584305, "rewards/ndcg_rule_reward": -0.02517808834090829, "rewards/rule_reward": 0.029296875, "step": 1226, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 6.67578125, "epoch": 0.7438617762958473, "grad_norm": 3.2653753757476807, "kl": 39.125, "learning_rate": 7.240828220798196e-06, "loss": 0.039, "reward": 0.003342147567309439, "reward_std": 0.10787119716405869, "rewards/ndcg_rule_reward": -0.02204847801476717, "rewards/rule_reward": 0.025390625, "step": 1227, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7444680206123068, "grad_norm": 10.145501136779785, "kl": 127.625, "learning_rate": 7.236440343747313e-06, "loss": 0.128, "reward": 0.005336751928552985, "reward_std": 0.16579975187778473, "rewards/ndcg_rule_reward": -0.03372574783861637, "rewards/rule_reward": 0.0390625, "step": 1228, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7450742649287663, "grad_norm": 4.125866889953613, "kl": 33.0625, "learning_rate": 7.232050312498122e-06, "loss": 0.0331, "reward": 0.004078551195561886, "reward_std": 0.16641463339328766, "rewards/ndcg_rule_reward": -0.03498394973576069, "rewards/rule_reward": 0.0390625, "step": 1229, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7456805092452258, "grad_norm": 1.3732892274856567, "kl": 20.5, "learning_rate": 7.22765813127922e-06, "loss": 0.0205, "reward": 0.0028602478560060263, "reward_std": 0.09122100844979286, "rewards/ndcg_rule_reward": -0.01862412877380848, "rewards/rule_reward": 0.021484375, "step": 1230, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7462867535616854, "grad_norm": 1.6433665752410889, "kl": 51.5, "learning_rate": 7.223263804321269e-06, "loss": 0.0516, "reward": 0.0043740468099713326, "reward_std": 0.1410050243139267, "rewards/ndcg_rule_reward": -0.028829077258706093, "rewards/rule_reward": 0.033203125, "step": 1231, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7468929978781449, "grad_norm": 1.6821329593658447, "kl": 32.75, "learning_rate": 7.218867335857e-06, "loss": 0.0327, "reward": 0.0029628428164869547, "reward_std": 0.08275815099477768, "rewards/ndcg_rule_reward": -0.016568407882004976, "rewards/rule_reward": 0.01953125, "step": 1232, "token_diversity": 0.35546875 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.7474992421946044, "grad_norm": 1.6399428844451904, "kl": 43.625, "learning_rate": 7.214468730121209e-06, "loss": 0.0436, "reward": 0.0046459531877189875, "reward_std": 0.1324853152036667, "rewards/ndcg_rule_reward": -0.02660404611378908, "rewards/rule_reward": 0.03125, "step": 1233, "token_diversity": 0.41497672872340424 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.748105486511064, "grad_norm": 71.58999633789062, "kl": 374.0, "learning_rate": 7.21006799135075e-06, "loss": 0.3737, "reward": 0.005403747316449881, "reward_std": 0.14894887804985046, "rewards/ndcg_rule_reward": -0.029752501286566257, "rewards/rule_reward": 0.03515625, "step": 1234, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7487117308275235, "grad_norm": 2.2648608684539795, "kl": 27.65625, "learning_rate": 7.205665123784528e-06, "loss": 0.0276, "reward": 0.004284134833142161, "reward_std": 0.14948948472738266, "rewards/ndcg_rule_reward": -0.03087211586534977, "rewards/rule_reward": 0.03515625, "step": 1235, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.749317975143983, "grad_norm": 2.0374250411987305, "kl": 40.6875, "learning_rate": 7.2012601316635015e-06, "loss": 0.0407, "reward": 0.00403518439270556, "reward_std": 0.13274363055825233, "rewards/ndcg_rule_reward": -0.027214815840125084, "rewards/rule_reward": 0.03125, "step": 1236, "token_diversity": 0.4534350198412698 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7499242194604425, "grad_norm": 2.425870418548584, "kl": 23.0, "learning_rate": 7.196853019230676e-06, "loss": 0.023, "reward": 0.003376383800059557, "reward_std": 0.1162169948220253, "rewards/ndcg_rule_reward": -0.02396736666560173, "rewards/rule_reward": 0.02734375, "step": 1237, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7505304637769021, "grad_norm": 16.195812225341797, "kl": 128.25, "learning_rate": 7.1924437907310985e-06, "loss": 0.1281, "reward": 0.0031578148482367396, "reward_std": 0.11635347455739975, "rewards/ndcg_rule_reward": -0.024185934104025364, "rewards/rule_reward": 0.02734375, "step": 1238, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7511367080933616, "grad_norm": 1.705288052558899, "kl": 33.375, "learning_rate": 7.188032450411855e-06, "loss": 0.0334, "reward": 0.0037618440110236406, "reward_std": 0.12449578568339348, "rewards/ndcg_rule_reward": -0.025535031221807003, "rewards/rule_reward": 0.029296875, "step": 1239, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7517429524098211, "grad_norm": 2.136085033416748, "kl": 26.0625, "learning_rate": 7.183619002522062e-06, "loss": 0.0261, "reward": 0.0034559504128992558, "reward_std": 0.1162213571369648, "rewards/ndcg_rule_reward": -0.023887800984084606, "rewards/rule_reward": 0.02734375, "step": 1240, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7523491967262806, "grad_norm": 2.2931408882141113, "kl": 23.375, "learning_rate": 7.179203451312871e-06, "loss": 0.0234, "reward": 0.0041209718910977244, "reward_std": 0.14114483073353767, "rewards/ndcg_rule_reward": -0.029082152992486954, "rewards/rule_reward": 0.033203125, "step": 1241, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7529554410427403, "grad_norm": 2.566065549850464, "kl": 27.875, "learning_rate": 7.1747858010374574e-06, "loss": 0.0278, "reward": 0.0036319897044450045, "reward_std": 0.12453650683164597, "rewards/ndcg_rule_reward": -0.025664886459708214, "rewards/rule_reward": 0.029296875, "step": 1242, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7535616853591998, "grad_norm": 2.221940755844116, "kl": 26.9375, "learning_rate": 7.170366055951017e-06, "loss": 0.0269, "reward": 0.003135676379315555, "reward_std": 0.10793154314160347, "rewards/ndcg_rule_reward": -0.022254948504269123, "rewards/rule_reward": 0.025390625, "step": 1243, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7541679296756593, "grad_norm": 1.8133807182312012, "kl": 5.703125, "learning_rate": 7.165944220310766e-06, "loss": 0.0057, "reward": 0.0025826433557085693, "reward_std": 0.0997995063662529, "rewards/ndcg_rule_reward": -0.020854856353253126, "rewards/rule_reward": 0.0234375, "step": 1244, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7547741739921188, "grad_norm": 1.8955837488174438, "kl": 35.1875, "learning_rate": 7.161520298375933e-06, "loss": 0.0352, "reward": 0.004563010414130986, "reward_std": 0.14094368368387222, "rewards/ndcg_rule_reward": -0.02864011563360691, "rewards/rule_reward": 0.033203125, "step": 1245, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7553804183085784, "grad_norm": 1.295020341873169, "kl": 32.125, "learning_rate": 7.157094294407757e-06, "loss": 0.0322, "reward": 0.0023839527857489884, "reward_std": 0.0830310508608818, "rewards/ndcg_rule_reward": -0.01714729703962803, "rewards/rule_reward": 0.01953125, "step": 1246, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7559866626250379, "grad_norm": 2.259141683578491, "kl": 16.5625, "learning_rate": 7.152666212669479e-06, "loss": 0.0166, "reward": 0.003600334981456399, "reward_std": 0.1414104588329792, "rewards/ndcg_rule_reward": -0.02960279118269682, "rewards/rule_reward": 0.033203125, "step": 1247, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7565929069414974, "grad_norm": 3.8002235889434814, "kl": 35.625, "learning_rate": 7.148236057426347e-06, "loss": 0.0356, "reward": 0.003223616164177656, "reward_std": 0.11633477360010147, "rewards/ndcg_rule_reward": -0.024120133370161057, "rewards/rule_reward": 0.02734375, "step": 1248, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.757199151257957, "grad_norm": 1.512502670288086, "kl": 33.25, "learning_rate": 7.143803832945602e-06, "loss": 0.0332, "reward": 0.0034189863363280892, "reward_std": 0.11623745039105415, "rewards/ndcg_rule_reward": -0.023924763314425945, "rewards/rule_reward": 0.02734375, "step": 1249, "token_diversity": 0.53515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7578053955744165, "grad_norm": 3.32487154006958, "kl": 33.25, "learning_rate": 7.139369543496479e-06, "loss": 0.0333, "reward": 0.0025273457285948098, "reward_std": 0.07455955445766449, "rewards/ndcg_rule_reward": -0.015050779096782207, "rewards/rule_reward": 0.017578125, "step": 1250, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.758411639890876, "grad_norm": 2.5916078090667725, "kl": 47.5, "learning_rate": 7.134933193350205e-06, "loss": 0.0475, "reward": 0.003035179222933948, "reward_std": 0.10795729234814644, "rewards/ndcg_rule_reward": -0.022355444729328156, "rewards/rule_reward": 0.025390625, "step": 1251, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7590178842073355, "grad_norm": 1.3386595249176025, "kl": 24.15625, "learning_rate": 7.130494786779987e-06, "loss": 0.0241, "reward": 0.002482785319443792, "reward_std": 0.07460719533264637, "rewards/ndcg_rule_reward": -0.015095340088009834, "rewards/rule_reward": 0.017578125, "step": 1252, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7596241285237951, "grad_norm": 0.9770424962043762, "kl": 21.71875, "learning_rate": 7.126054328061014e-06, "loss": 0.0217, "reward": 0.0016918128821998835, "reward_std": 0.05812774412333965, "rewards/ndcg_rule_reward": -0.011980063049122691, "rewards/rule_reward": 0.013671875, "step": 1253, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7602303728402546, "grad_norm": 7.954211235046387, "kl": 113.6875, "learning_rate": 7.121611821470457e-06, "loss": 0.1137, "reward": 0.0037506227381527424, "reward_std": 0.1412886083126068, "rewards/ndcg_rule_reward": -0.02945250179618597, "rewards/rule_reward": 0.033203125, "step": 1254, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7608366171567141, "grad_norm": 1.8325444459915161, "kl": 26.9375, "learning_rate": 7.117167271287453e-06, "loss": 0.027, "reward": 0.003482308122329414, "reward_std": 0.11618403345346451, "rewards/ndcg_rule_reward": -0.02386144269257784, "rewards/rule_reward": 0.02734375, "step": 1255, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7614428614731736, "grad_norm": 1.8171331882476807, "kl": 40.3125, "learning_rate": 7.112720681793108e-06, "loss": 0.0402, "reward": 0.002522778057027608, "reward_std": 0.07456038519740105, "rewards/ndcg_rule_reward": -0.015055348165333271, "rewards/rule_reward": 0.017578125, "step": 1256, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7620491057896333, "grad_norm": 3.136122703552246, "kl": 45.5, "learning_rate": 7.108272057270497e-06, "loss": 0.0454, "reward": 0.0035805661464110017, "reward_std": 0.11618683487176895, "rewards/ndcg_rule_reward": -0.023763184435665607, "rewards/rule_reward": 0.02734375, "step": 1257, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7626553501060928, "grad_norm": 1.5159109830856323, "kl": 12.15625, "learning_rate": 7.103821402004654e-06, "loss": 0.0122, "reward": 0.0025615838821977377, "reward_std": 0.09137937426567078, "rewards/ndcg_rule_reward": -0.01892279088497162, "rewards/rule_reward": 0.021484375, "step": 1258, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.7632615944225523, "grad_norm": 2.8017375469207764, "kl": 21.6875, "learning_rate": 7.0993687202825645e-06, "loss": 0.0217, "reward": 0.0033783400431275368, "reward_std": 0.09942712262272835, "rewards/ndcg_rule_reward": -0.020059159956872463, "rewards/rule_reward": 0.0234375, "step": 1259, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7638678387390119, "grad_norm": 4.060194969177246, "kl": 26.3125, "learning_rate": 7.0949140163931695e-06, "loss": 0.0264, "reward": 0.0024033248191699386, "reward_std": 0.08303434401750565, "rewards/ndcg_rule_reward": -0.017127925530076027, "rewards/rule_reward": 0.01953125, "step": 1260, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7644740830554714, "grad_norm": 0.8737215995788574, "kl": 21.5, "learning_rate": 7.090457294627358e-06, "loss": 0.0215, "reward": 0.0026481845998205245, "reward_std": 0.07449174299836159, "rewards/ndcg_rule_reward": -0.014929940924048424, "rewards/rule_reward": 0.017578125, "step": 1261, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7650803273719309, "grad_norm": 1.841583251953125, "kl": 13.328125, "learning_rate": 7.0859985592779614e-06, "loss": 0.0133, "reward": 0.002600839827209711, "reward_std": 0.074539914727211, "rewards/ndcg_rule_reward": -0.014977286104112864, "rewards/rule_reward": 0.017578125, "step": 1262, "token_diversity": 0.3671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7656865716883904, "grad_norm": 2.500138759613037, "kl": 16.90625, "learning_rate": 7.08153781463975e-06, "loss": 0.0169, "reward": 0.0031355576356872916, "reward_std": 0.09953408315777779, "rewards/ndcg_rule_reward": -0.020301942713558674, "rewards/rule_reward": 0.0234375, "step": 1263, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.76629281600485, "grad_norm": 2.269230842590332, "kl": 29.75, "learning_rate": 7.0770750650094335e-06, "loss": 0.0297, "reward": 0.0037683192640542984, "reward_std": 0.14973078668117523, "rewards/ndcg_rule_reward": -0.03138792980462313, "rewards/rule_reward": 0.03515625, "step": 1264, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.7668990603213095, "grad_norm": 8.621294975280762, "kl": 56.375, "learning_rate": 7.0726103146856475e-06, "loss": 0.0564, "reward": 0.0038014164892956614, "reward_std": 0.11605199426412582, "rewards/ndcg_rule_reward": -0.023542333394289017, "rewards/rule_reward": 0.02734375, "step": 1265, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.767505304637769, "grad_norm": 2.270961284637451, "kl": 34.375, "learning_rate": 7.068143567968958e-06, "loss": 0.0343, "reward": 0.004056979552842677, "reward_std": 0.1327364519238472, "rewards/ndcg_rule_reward": -0.027193020097911358, "rewards/rule_reward": 0.03125, "step": 1266, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 6.681640625, "epoch": 0.7681115489542285, "grad_norm": 1.4258266687393188, "kl": 33.75, "learning_rate": 7.063674829161854e-06, "loss": 0.0337, "reward": 0.0028017524164170027, "reward_std": 0.10809917747974396, "rewards/ndcg_rule_reward": -0.02258887328207493, "rewards/rule_reward": 0.025390625, "step": 1267, "token_diversity": 0.28719565763052207 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.7687177932706881, "grad_norm": 1.8836771249771118, "kl": 28.8125, "learning_rate": 7.059204102568741e-06, "loss": 0.0288, "reward": 0.004190589184872806, "reward_std": 0.13268963247537613, "rewards/ndcg_rule_reward": -0.027059410698711872, "rewards/rule_reward": 0.03125, "step": 1268, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.7693240375871476, "grad_norm": 1.5040249824523926, "kl": 26.625, "learning_rate": 7.054731392495941e-06, "loss": 0.0266, "reward": 0.0027024076553061604, "reward_std": 0.08290740102529526, "rewards/ndcg_rule_reward": -0.01682884292677045, "rewards/rule_reward": 0.01953125, "step": 1269, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7699302819036071, "grad_norm": 2.219372034072876, "kl": 21.625, "learning_rate": 7.0502567032516885e-06, "loss": 0.0216, "reward": 0.002050985873211175, "reward_std": 0.08321094140410423, "rewards/ndcg_rule_reward": -0.01748026441782713, "rewards/rule_reward": 0.01953125, "step": 1270, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7705365262200667, "grad_norm": 2.0666158199310303, "kl": 27.1875, "learning_rate": 7.0457800391461204e-06, "loss": 0.0272, "reward": 0.0026092392508871853, "reward_std": 0.0997607670724392, "rewards/ndcg_rule_reward": -0.02082826104015112, "rewards/rule_reward": 0.0234375, "step": 1271, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7711427705365262, "grad_norm": 1.1051533222198486, "kl": 29.8125, "learning_rate": 7.041301404491278e-06, "loss": 0.0299, "reward": 0.003194584627635777, "reward_std": 0.09108445048332214, "rewards/ndcg_rule_reward": -0.01828979142010212, "rewards/rule_reward": 0.021484375, "step": 1272, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7717490148529857, "grad_norm": 2.086286783218384, "kl": 31.5, "learning_rate": 7.036820803601099e-06, "loss": 0.0314, "reward": 0.003389771212823689, "reward_std": 0.10782361403107643, "rewards/ndcg_rule_reward": -0.022000853903591633, "rewards/rule_reward": 0.025390625, "step": 1273, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7723552591694453, "grad_norm": 2.700798988342285, "kl": 32.625, "learning_rate": 7.032338240791419e-06, "loss": 0.0326, "reward": 0.0032162528950721025, "reward_std": 0.11629564687609673, "rewards/ndcg_rule_reward": -0.02412749733775854, "rewards/rule_reward": 0.02734375, "step": 1274, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7729615034859049, "grad_norm": 2.0016465187072754, "kl": 26.4375, "learning_rate": 7.027853720379956e-06, "loss": 0.0264, "reward": 0.002925471984781325, "reward_std": 0.09962820261716843, "rewards/ndcg_rule_reward": -0.020512028597295284, "rewards/rule_reward": 0.0234375, "step": 1275, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7735677478023644, "grad_norm": 7.044174671173096, "kl": 26.75, "learning_rate": 7.023367246686323e-06, "loss": 0.0268, "reward": 0.0025674868375062943, "reward_std": 0.0829482413828373, "rewards/ndcg_rule_reward": -0.01696376409381628, "rewards/rule_reward": 0.01953125, "step": 1276, "token_diversity": 0.3515625 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.7741739921188239, "grad_norm": 4.856385707855225, "kl": 56.625, "learning_rate": 7.0188788240320095e-06, "loss": 0.0567, "reward": 0.0025662736734375358, "reward_std": 0.09977268427610397, "rewards/ndcg_rule_reward": -0.02087122667580843, "rewards/rule_reward": 0.0234375, "step": 1277, "token_diversity": 0.4568452380952381 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7747802364352834, "grad_norm": 1.4776968955993652, "kl": 28.71875, "learning_rate": 7.014388456740379e-06, "loss": 0.0286, "reward": 0.0039481790736317635, "reward_std": 0.1328025907278061, "rewards/ndcg_rule_reward": -0.02730182185769081, "rewards/rule_reward": 0.03125, "step": 1278, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.775386480751743, "grad_norm": 1.871906042098999, "kl": 31.5625, "learning_rate": 7.009896149136675e-06, "loss": 0.0316, "reward": 0.004419811652041972, "reward_std": 0.12416727840900421, "rewards/ndcg_rule_reward": -0.024877062998712063, "rewards/rule_reward": 0.029296875, "step": 1279, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7759927250682025, "grad_norm": 2.2637693881988525, "kl": 54.75, "learning_rate": 7.005401905548006e-06, "loss": 0.0548, "reward": 0.004931694478727877, "reward_std": 0.14076795428991318, "rewards/ndcg_rule_reward": -0.028271430172026157, "rewards/rule_reward": 0.033203125, "step": 1280, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.776598969384662, "grad_norm": 1.4911680221557617, "kl": 40.875, "learning_rate": 7.000905730303343e-06, "loss": 0.0408, "reward": 0.0042361029190942645, "reward_std": 0.12425077706575394, "rewards/ndcg_rule_reward": -0.025060771964490414, "rewards/rule_reward": 0.029296875, "step": 1281, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7772052137011215, "grad_norm": 1.7513725757598877, "kl": 38.5, "learning_rate": 6.996407627733526e-06, "loss": 0.0384, "reward": 0.003220006125047803, "reward_std": 0.10790584981441498, "rewards/ndcg_rule_reward": -0.02217061910778284, "rewards/rule_reward": 0.025390625, "step": 1282, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.7778114580175811, "grad_norm": 1.5813038349151611, "kl": 29.0, "learning_rate": 6.991907602171241e-06, "loss": 0.029, "reward": 0.00290422304533422, "reward_std": 0.09124011918902397, "rewards/ndcg_rule_reward": -0.018580152187496424, "rewards/rule_reward": 0.021484375, "step": 1283, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7784177023340406, "grad_norm": 2.975426197052002, "kl": 31.21875, "learning_rate": 6.987405657951033e-06, "loss": 0.0312, "reward": 0.00273860024753958, "reward_std": 0.08289626240730286, "rewards/ndcg_rule_reward": -0.016792650800198317, "rewards/rule_reward": 0.01953125, "step": 1284, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.92578125, "epoch": 0.7790239466505001, "grad_norm": 2.394345760345459, "kl": 42.625, "learning_rate": 6.982901799409294e-06, "loss": 0.0425, "reward": 0.003397757885977626, "reward_std": 0.1078154593706131, "rewards/ndcg_rule_reward": -0.02199286688119173, "rewards/rule_reward": 0.025390625, "step": 1285, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7796301909669597, "grad_norm": 1.9816770553588867, "kl": 41.25, "learning_rate": 6.97839603088426e-06, "loss": 0.0412, "reward": 0.004318716935813427, "reward_std": 0.1326277144253254, "rewards/ndcg_rule_reward": -0.026931283995509148, "rewards/rule_reward": 0.03125, "step": 1286, "token_diversity": 0.44921875 }, { "categorical_diversity": 0.984375, "completion_length": 7.162109375, "epoch": 0.7802364352834192, "grad_norm": 1.6166188716888428, "kl": 29.5625, "learning_rate": 6.973888356716003e-06, "loss": 0.0295, "reward": 0.004401093116030097, "reward_std": 0.14102058857679367, "rewards/ndcg_rule_reward": -0.02880203165113926, "rewards/rule_reward": 0.033203125, "step": 1287, "token_diversity": 0.2664037379421222 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7808426795998787, "grad_norm": 1.5088844299316406, "kl": 38.125, "learning_rate": 6.969378781246436e-06, "loss": 0.0381, "reward": 0.002614942903164774, "reward_std": 0.0913645327091217, "rewards/ndcg_rule_reward": -0.018869432620704174, "rewards/rule_reward": 0.021484375, "step": 1288, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7814489239163382, "grad_norm": 2.894726037979126, "kl": 21.625, "learning_rate": 6.964867308819303e-06, "loss": 0.0216, "reward": 0.0022781476145610213, "reward_std": 0.09992209076881409, "rewards/ndcg_rule_reward": -0.021159352734684944, "rewards/rule_reward": 0.0234375, "step": 1289, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7820551682327979, "grad_norm": 1.1912498474121094, "kl": 21.21875, "learning_rate": 6.960353943780167e-06, "loss": 0.0213, "reward": 0.002390795387327671, "reward_std": 0.08306746929883957, "rewards/ndcg_rule_reward": -0.017140455543994904, "rewards/rule_reward": 0.01953125, "step": 1290, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7826614125492574, "grad_norm": 1.4413621425628662, "kl": 35.75, "learning_rate": 6.955838690476426e-06, "loss": 0.0357, "reward": 0.003087982186116278, "reward_std": 0.1079430803656578, "rewards/ndcg_rule_reward": -0.022302642464637756, "rewards/rule_reward": 0.025390625, "step": 1291, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.7832676568657169, "grad_norm": 2.5120160579681396, "kl": 27.9375, "learning_rate": 6.951321553257288e-06, "loss": 0.0279, "reward": 0.003518568875733763, "reward_std": 0.09091974049806595, "rewards/ndcg_rule_reward": -0.017965806648135185, "rewards/rule_reward": 0.021484375, "step": 1292, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7838739011821764, "grad_norm": 5.151121616363525, "kl": 46.375, "learning_rate": 6.946802536473781e-06, "loss": 0.0464, "reward": 0.004458528012037277, "reward_std": 0.13254785537719727, "rewards/ndcg_rule_reward": -0.026791471987962723, "rewards/rule_reward": 0.03125, "step": 1293, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.784480145498636, "grad_norm": 1.407698392868042, "kl": 24.0625, "learning_rate": 6.942281644478739e-06, "loss": 0.024, "reward": 0.003240579506382346, "reward_std": 0.09107077494263649, "rewards/ndcg_rule_reward": -0.018243796657770872, "rewards/rule_reward": 0.021484375, "step": 1294, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7850863898150955, "grad_norm": 2.677579641342163, "kl": 40.3125, "learning_rate": 6.937758881626804e-06, "loss": 0.0403, "reward": 0.0031252054031938314, "reward_std": 0.09953981637954712, "rewards/ndcg_rule_reward": -0.0203122952952981, "rewards/rule_reward": 0.0234375, "step": 1295, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.785692634131555, "grad_norm": 1.9339454174041748, "kl": 50.375, "learning_rate": 6.9332342522744235e-06, "loss": 0.0505, "reward": 0.0047664346639066935, "reward_std": 0.13241757079958916, "rewards/ndcg_rule_reward": -0.026483564637601376, "rewards/rule_reward": 0.03125, "step": 1296, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7862988784480146, "grad_norm": 0.9956029653549194, "kl": 19.625, "learning_rate": 6.928707760779838e-06, "loss": 0.0196, "reward": 0.001999186642933637, "reward_std": 0.057962577790021896, "rewards/ndcg_rule_reward": -0.01167268818244338, "rewards/rule_reward": 0.013671875, "step": 1297, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7869051227644741, "grad_norm": 1.3738821744918823, "kl": 26.625, "learning_rate": 6.924179411503081e-06, "loss": 0.0266, "reward": 0.0025273459032177925, "reward_std": 0.08299248665571213, "rewards/ndcg_rule_reward": -0.017003904096782207, "rewards/rule_reward": 0.01953125, "step": 1298, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7875113670809336, "grad_norm": 2.264019012451172, "kl": 46.25, "learning_rate": 6.919649208805982e-06, "loss": 0.0463, "reward": 0.003720384556800127, "reward_std": 0.09925245866179466, "rewards/ndcg_rule_reward": -0.01971711590886116, "rewards/rule_reward": 0.0234375, "step": 1299, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7881176113973931, "grad_norm": 3.0069830417633057, "kl": 22.78125, "learning_rate": 6.915117157052149e-06, "loss": 0.0228, "reward": 0.003938112175092101, "reward_std": 0.12437784671783447, "rewards/ndcg_rule_reward": -0.02535876352339983, "rewards/rule_reward": 0.029296875, "step": 1300, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7887238557138527, "grad_norm": 2.022649049758911, "kl": 29.8125, "learning_rate": 6.9105832606069756e-06, "loss": 0.0297, "reward": 0.0043132861610502005, "reward_std": 0.13262130320072174, "rewards/ndcg_rule_reward": -0.026936715468764305, "rewards/rule_reward": 0.03125, "step": 1301, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 6.201171875, "epoch": 0.7893301000303122, "grad_norm": 1.905207633972168, "kl": 25.9375, "learning_rate": 6.906047523837629e-06, "loss": 0.026, "reward": 0.0033957872074097395, "reward_std": 0.12463994324207306, "rewards/ndcg_rule_reward": -0.025901087559759617, "rewards/rule_reward": 0.029296875, "step": 1302, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7899363443467717, "grad_norm": 2.1499340534210205, "kl": 33.4375, "learning_rate": 6.90150995111305e-06, "loss": 0.0334, "reward": 0.0032146627781912684, "reward_std": 0.10790731757879257, "rewards/ndcg_rule_reward": -0.02217596210539341, "rewards/rule_reward": 0.025390625, "step": 1303, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.7905425886632312, "grad_norm": 1.514021635055542, "kl": 37.75, "learning_rate": 6.89697054680395e-06, "loss": 0.0378, "reward": 0.0025367848575115204, "reward_std": 0.07456354796886444, "rewards/ndcg_rule_reward": -0.015041340608149767, "rewards/rule_reward": 0.017578125, "step": 1304, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7911488329796909, "grad_norm": 1.8913054466247559, "kl": 18.75, "learning_rate": 6.892429315282802e-06, "loss": 0.0188, "reward": 0.001967897522263229, "reward_std": 0.07482828199863434, "rewards/ndcg_rule_reward": -0.015610228292644024, "rewards/rule_reward": 0.017578125, "step": 1305, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7917550772961504, "grad_norm": 1.9603022336959839, "kl": 27.6875, "learning_rate": 6.887886260923841e-06, "loss": 0.0277, "reward": 0.002780733397230506, "reward_std": 0.08284906670451164, "rewards/ndcg_rule_reward": -0.016750517301261425, "rewards/rule_reward": 0.01953125, "step": 1306, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7923613216126099, "grad_norm": 1.8210726976394653, "kl": 8.25, "learning_rate": 6.8833413881030556e-06, "loss": 0.0082, "reward": 0.0019581365631893277, "reward_std": 0.09162664413452148, "rewards/ndcg_rule_reward": -0.019526238553225994, "rewards/rule_reward": 0.021484375, "step": 1307, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.7929675659290694, "grad_norm": 2.0068681240081787, "kl": 27.875, "learning_rate": 6.878794701198186e-06, "loss": 0.0279, "reward": 0.003401615540497005, "reward_std": 0.13307005912065506, "rewards/ndcg_rule_reward": -0.027848384343087673, "rewards/rule_reward": 0.03125, "step": 1308, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.793573810245529, "grad_norm": 1.879750370979309, "kl": 27.125, "learning_rate": 6.874246204588724e-06, "loss": 0.0271, "reward": 0.0027187246596440673, "reward_std": 0.09972897544503212, "rewards/ndcg_rule_reward": -0.020718775689601898, "rewards/rule_reward": 0.0234375, "step": 1309, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7941800545619885, "grad_norm": 2.9930527210235596, "kl": 29.3125, "learning_rate": 6.869695902655898e-06, "loss": 0.0294, "reward": 0.002878829138353467, "reward_std": 0.07438888028264046, "rewards/ndcg_rule_reward": -0.014699296560138464, "rewards/rule_reward": 0.017578125, "step": 1310, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.794786298878448, "grad_norm": 1.2909537553787231, "kl": 20.1875, "learning_rate": 6.865143799782681e-06, "loss": 0.0202, "reward": 0.0025589592405594885, "reward_std": 0.06612344831228256, "rewards/ndcg_rule_reward": -0.01306604128330946, "rewards/rule_reward": 0.015625, "step": 1311, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7953925431949076, "grad_norm": 1.6792848110198975, "kl": 24.25, "learning_rate": 6.860589900353778e-06, "loss": 0.0243, "reward": 0.0037930081598460674, "reward_std": 0.12445057928562164, "rewards/ndcg_rule_reward": -0.02550386730581522, "rewards/rule_reward": 0.029296875, "step": 1312, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7959987875113671, "grad_norm": 1.714556336402893, "kl": 31.125, "learning_rate": 6.856034208755627e-06, "loss": 0.0311, "reward": 0.0032379108015447855, "reward_std": 0.11629652604460716, "rewards/ndcg_rule_reward": -0.024105838499963284, "rewards/rule_reward": 0.02734375, "step": 1313, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7966050318278266, "grad_norm": 1.4461779594421387, "kl": 34.1875, "learning_rate": 6.851476729376385e-06, "loss": 0.0341, "reward": 0.003346094163134694, "reward_std": 0.10785754024982452, "rewards/ndcg_rule_reward": -0.02204453106969595, "rewards/rule_reward": 0.025390625, "step": 1314, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7972112761442861, "grad_norm": 0.6291335821151733, "kl": 33.5625, "learning_rate": 6.84691746660594e-06, "loss": 0.0336, "reward": 0.002239089284557849, "reward_std": 0.05785801634192467, "rewards/ndcg_rule_reward": -0.011432786006480455, "rewards/rule_reward": 0.013671875, "step": 1315, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.7978175204607457, "grad_norm": 1.7697532176971436, "kl": 38.75, "learning_rate": 6.8423564248358955e-06, "loss": 0.0387, "reward": 0.0035322806797921658, "reward_std": 0.12457150220870972, "rewards/ndcg_rule_reward": -0.02576459478586912, "rewards/rule_reward": 0.029296875, "step": 1316, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 7.8828125, "epoch": 0.7984237647772052, "grad_norm": 2.3129384517669678, "kl": 35.6875, "learning_rate": 6.837793608459565e-06, "loss": 0.0357, "reward": 0.0024963419418781996, "reward_std": 0.08296485617756844, "rewards/ndcg_rule_reward": -0.017034908290952444, "rewards/rule_reward": 0.01953125, "step": 1317, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7990300090936647, "grad_norm": 1.698219656944275, "kl": 26.65625, "learning_rate": 6.833229021871974e-06, "loss": 0.0267, "reward": 0.0030445458833128214, "reward_std": 0.10801593959331512, "rewards/ndcg_rule_reward": -0.022346080280840397, "rewards/rule_reward": 0.025390625, "step": 1318, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.7996362534101242, "grad_norm": 1.644836187362671, "kl": 30.5, "learning_rate": 6.828662669469852e-06, "loss": 0.0305, "reward": 0.002975119336042553, "reward_std": 0.08275575190782547, "rewards/ndcg_rule_reward": -0.016556130722165108, "rewards/rule_reward": 0.01953125, "step": 1319, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8002424977265838, "grad_norm": 1.7901853322982788, "kl": 30.0625, "learning_rate": 6.82409455565163e-06, "loss": 0.0301, "reward": 0.0031137281330302358, "reward_std": 0.1079300194978714, "rewards/ndcg_rule_reward": -0.022276896983385086, "rewards/rule_reward": 0.025390625, "step": 1320, "token_diversity": 0.42578125 }, { "epoch": 0.8002424977265838, "eval_categorical_diversity": 1.0, "eval_completion_length": 5.0, "eval_kl": 25.78301056338028, "eval_loss": 0.0259491465985775, "eval_reward": 0.001361364821746001, "eval_reward_std": 0.047787889365998794, "eval_rewards/ndcg_rule_reward": -0.009875981222418413, "eval_rewards/rule_reward": 0.011237345950704225, "eval_runtime": 85.0136, "eval_samples_per_second": 53.309, "eval_steps_per_second": 0.059, "eval_token_diversity": 0.3360475352112676, "step": 1320 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8008487420430433, "grad_norm": 1.4887126684188843, "kl": 49.625, "learning_rate": 6.819524684817439e-06, "loss": 0.0496, "reward": 0.004352172021754086, "reward_std": 0.1494680903851986, "rewards/ndcg_rule_reward": -0.03080407902598381, "rewards/rule_reward": 0.03515625, "step": 1321, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 9.564453125, "epoch": 0.8014549863595029, "grad_norm": 1.4907151460647583, "kl": 16.84375, "learning_rate": 6.814953061369094e-06, "loss": 0.0169, "reward": 0.003079733345657587, "reward_std": 0.1079753190279007, "rewards/ndcg_rule_reward": -0.0223108921200037, "rewards/rule_reward": 0.025390625, "step": 1322, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.8020612306759625, "grad_norm": 2.855233907699585, "kl": 33.75, "learning_rate": 6.810379689710106e-06, "loss": 0.0339, "reward": 0.003702885704115033, "reward_std": 0.13290874660015106, "rewards/ndcg_rule_reward": -0.02754711452871561, "rewards/rule_reward": 0.03125, "step": 1323, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.802667474992422, "grad_norm": 16.471481323242188, "kl": 39.78125, "learning_rate": 6.805804574245667e-06, "loss": 0.0397, "reward": 0.0022809699294157326, "reward_std": 0.0662744827568531, "rewards/ndcg_rule_reward": -0.01334403082728386, "rewards/rule_reward": 0.015625, "step": 1324, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 7.40234375, "epoch": 0.8032737193088815, "grad_norm": 1.47645103931427, "kl": 18.5, "learning_rate": 6.8012277193826485e-06, "loss": 0.0185, "reward": 0.0030673258588649333, "reward_std": 0.09112313762307167, "rewards/ndcg_rule_reward": -0.018417049199342728, "rewards/rule_reward": 0.021484375, "step": 1325, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.803879963625341, "grad_norm": 1.8393445014953613, "kl": 26.15625, "learning_rate": 6.796649129529599e-06, "loss": 0.0262, "reward": 0.003788938745856285, "reward_std": 0.11602365970611572, "rewards/ndcg_rule_reward": -0.02355481218546629, "rewards/rule_reward": 0.02734375, "step": 1326, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8044862079418006, "grad_norm": 2.549724817276001, "kl": 20.21875, "learning_rate": 6.792068809096735e-06, "loss": 0.0202, "reward": 0.0037041387986391783, "reward_std": 0.14137056469917297, "rewards/ndcg_rule_reward": -0.029498986899852753, "rewards/rule_reward": 0.033203125, "step": 1327, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8050924522582601, "grad_norm": 1.781384825706482, "kl": 16.875, "learning_rate": 6.787486762495942e-06, "loss": 0.0169, "reward": 0.0031099661719053984, "reward_std": 0.107969019562006, "rewards/ndcg_rule_reward": -0.022280660457909107, "rewards/rule_reward": 0.025390625, "step": 1328, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8056986965747196, "grad_norm": 1.6167114973068237, "kl": 37.9375, "learning_rate": 6.782902994140771e-06, "loss": 0.0379, "reward": 0.0032879142090678215, "reward_std": 0.11627645418047905, "rewards/ndcg_rule_reward": -0.024055836722254753, "rewards/rule_reward": 0.02734375, "step": 1329, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 8.36328125, "epoch": 0.8063049408911791, "grad_norm": 1.6350589990615845, "kl": 28.25, "learning_rate": 6.778317508446424e-06, "loss": 0.0283, "reward": 0.004173254827037454, "reward_std": 0.1411331743001938, "rewards/ndcg_rule_reward": -0.029029869474470615, "rewards/rule_reward": 0.033203125, "step": 1330, "token_diversity": 0.29131944444444446 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.8069111852076387, "grad_norm": 4.437723636627197, "kl": 48.625, "learning_rate": 6.773730309829764e-06, "loss": 0.0485, "reward": 0.004758727736771107, "reward_std": 0.14928767085075378, "rewards/ndcg_rule_reward": -0.030397523194551468, "rewards/rule_reward": 0.03515625, "step": 1331, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8075174295240982, "grad_norm": 2.36745023727417, "kl": 54.75, "learning_rate": 6.769141402709305e-06, "loss": 0.0549, "reward": 0.0037727932212874293, "reward_std": 0.12446359544992447, "rewards/ndcg_rule_reward": -0.025524080730974674, "rewards/rule_reward": 0.029296875, "step": 1332, "token_diversity": 0.55859375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8081236738405577, "grad_norm": 1.6678496599197388, "kl": 45.3125, "learning_rate": 6.764550791505198e-06, "loss": 0.0452, "reward": 0.0034763675648719072, "reward_std": 0.11622007936239243, "rewards/ndcg_rule_reward": -0.023867381736636162, "rewards/rule_reward": 0.02734375, "step": 1333, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8087299181570172, "grad_norm": 2.2942874431610107, "kl": 54.375, "learning_rate": 6.759958480639244e-06, "loss": 0.0543, "reward": 0.004438857547938824, "reward_std": 0.13258930295705795, "rewards/ndcg_rule_reward": -0.02681114338338375, "rewards/rule_reward": 0.03125, "step": 1334, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8093361624734768, "grad_norm": 2.0263445377349854, "kl": 40.375, "learning_rate": 6.7553644745348744e-06, "loss": 0.0404, "reward": 0.003318253206089139, "reward_std": 0.11626207083463669, "rewards/ndcg_rule_reward": -0.02402549795806408, "rewards/rule_reward": 0.02734375, "step": 1335, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8099424067899363, "grad_norm": 10.178746223449707, "kl": 45.125, "learning_rate": 6.750768777617161e-06, "loss": 0.045, "reward": 0.0028385529294610023, "reward_std": 0.10811103135347366, "rewards/ndcg_rule_reward": -0.022552071139216423, "rewards/rule_reward": 0.025390625, "step": 1336, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8105486511063958, "grad_norm": 5.092782497406006, "kl": 31.5625, "learning_rate": 6.746171394312799e-06, "loss": 0.0316, "reward": 0.0017827838310040534, "reward_std": 0.06652077659964561, "rewards/ndcg_rule_reward": -0.013842216227203608, "rewards/rule_reward": 0.015625, "step": 1337, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8111548954228555, "grad_norm": 1.195945382118225, "kl": 24.0625, "learning_rate": 6.74157232905011e-06, "loss": 0.024, "reward": 0.003043083706870675, "reward_std": 0.10798735171556473, "rewards/ndcg_rule_reward": -0.02234754152595997, "rewards/rule_reward": 0.025390625, "step": 1338, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.811761139739315, "grad_norm": 1.7292726039886475, "kl": 35.0, "learning_rate": 6.736971586259032e-06, "loss": 0.035, "reward": 0.004371752962470055, "reward_std": 0.14100687205791473, "rewards/ndcg_rule_reward": -0.028831373900175095, "rewards/rule_reward": 0.033203125, "step": 1339, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 6.681640625, "epoch": 0.8123673840557745, "grad_norm": 1.1472173929214478, "kl": 39.75, "learning_rate": 6.732369170371124e-06, "loss": 0.0398, "reward": 0.003173107688780874, "reward_std": 0.09108924493193626, "rewards/ndcg_rule_reward": -0.018311267718672752, "rewards/rule_reward": 0.021484375, "step": 1340, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.812973628372234, "grad_norm": 2.1876726150512695, "kl": 22.8125, "learning_rate": 6.727765085819554e-06, "loss": 0.0228, "reward": 0.003640237729996443, "reward_std": 0.14136075600981712, "rewards/ndcg_rule_reward": -0.029562887735664845, "rewards/rule_reward": 0.033203125, "step": 1341, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 7.8828125, "epoch": 0.8135798726886936, "grad_norm": 1.1707006692886353, "kl": 23.3125, "learning_rate": 6.723159337039097e-06, "loss": 0.0232, "reward": 0.0026198087725788355, "reward_std": 0.08292721211910248, "rewards/ndcg_rule_reward": -0.016911441460251808, "rewards/rule_reward": 0.01953125, "step": 1342, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8141861170051531, "grad_norm": 9.441442489624023, "kl": 21.0625, "learning_rate": 6.718551928466133e-06, "loss": 0.0211, "reward": 0.0028113573789596558, "reward_std": 0.08284028619527817, "rewards/ndcg_rule_reward": -0.016719892621040344, "rewards/rule_reward": 0.01953125, "step": 1343, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 8.36328125, "epoch": 0.8147923613216126, "grad_norm": 6.0719099044799805, "kl": 72.4375, "learning_rate": 6.713942864538638e-06, "loss": 0.0725, "reward": 0.0038134893402457237, "reward_std": 0.14127475768327713, "rewards/ndcg_rule_reward": -0.029389635659754276, "rewards/rule_reward": 0.033203125, "step": 1344, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8153986056380721, "grad_norm": 1.40139639377594, "kl": 28.5625, "learning_rate": 6.7093321496961865e-06, "loss": 0.0286, "reward": 0.004424866987392306, "reward_std": 0.12416438013315201, "rewards/ndcg_rule_reward": -0.024872008711099625, "rewards/rule_reward": 0.029296875, "step": 1345, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8160048499545317, "grad_norm": 0.9836082458496094, "kl": 33.1875, "learning_rate": 6.704719788379936e-06, "loss": 0.0332, "reward": 0.0030772180762141943, "reward_std": 0.09955643489956856, "rewards/ndcg_rule_reward": -0.020360281690955162, "rewards/rule_reward": 0.0234375, "step": 1346, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8166110942709912, "grad_norm": 1.4232819080352783, "kl": 4.67578125, "learning_rate": 6.700105785032638e-06, "loss": 0.0047, "reward": 0.0018913541571237147, "reward_std": 0.08328201249241829, "rewards/ndcg_rule_reward": -0.017639896366745234, "rewards/rule_reward": 0.01953125, "step": 1347, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8172173385874507, "grad_norm": 1.9250365495681763, "kl": 20.5625, "learning_rate": 6.695490144098622e-06, "loss": 0.0206, "reward": 0.002875623991712928, "reward_std": 0.09964920952916145, "rewards/ndcg_rule_reward": -0.020561876706779003, "rewards/rule_reward": 0.0234375, "step": 1348, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.8178235829039103, "grad_norm": 1.8546870946884155, "kl": 43.9375, "learning_rate": 6.690872870023795e-06, "loss": 0.0439, "reward": 0.004304897505789995, "reward_std": 0.13263022527098656, "rewards/ndcg_rule_reward": -0.026945102028548717, "rewards/rule_reward": 0.03125, "step": 1349, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8184298272203698, "grad_norm": 9.901844024658203, "kl": 35.0, "learning_rate": 6.686253967255635e-06, "loss": 0.035, "reward": 0.003057279682252556, "reward_std": 0.09112546592950821, "rewards/ndcg_rule_reward": -0.018427095375955105, "rewards/rule_reward": 0.021484375, "step": 1350, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8190360715368293, "grad_norm": 1.8512334823608398, "kl": 38.5, "learning_rate": 6.681633440243194e-06, "loss": 0.0385, "reward": 0.003989606280811131, "reward_std": 0.12435681372880936, "rewards/ndcg_rule_reward": -0.025307267904281616, "rewards/rule_reward": 0.029296875, "step": 1351, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8196423158532888, "grad_norm": 2.255518913269043, "kl": 40.9375, "learning_rate": 6.6770112934370825e-06, "loss": 0.0409, "reward": 0.0035238361451774836, "reward_std": 0.12457327172160149, "rewards/ndcg_rule_reward": -0.02577303908765316, "rewards/rule_reward": 0.029296875, "step": 1352, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8202485601697485, "grad_norm": 1.5521584749221802, "kl": 36.75, "learning_rate": 6.672387531289476e-06, "loss": 0.0367, "reward": 0.0029235216788947582, "reward_std": 0.0996314212679863, "rewards/ndcg_rule_reward": -0.020513979252427816, "rewards/rule_reward": 0.0234375, "step": 1353, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.820854804486208, "grad_norm": 1.8237786293029785, "kl": 46.25, "learning_rate": 6.667762158254104e-06, "loss": 0.0462, "reward": 0.0035801554331555963, "reward_std": 0.10775984451174736, "rewards/ndcg_rule_reward": -0.021810469217598438, "rewards/rule_reward": 0.025390625, "step": 1354, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8214610488026675, "grad_norm": 2.6310625076293945, "kl": 25.0625, "learning_rate": 6.663135178786247e-06, "loss": 0.025, "reward": 0.0020207875641062856, "reward_std": 0.06639301404356956, "rewards/ndcg_rule_reward": -0.013604212552309036, "rewards/rule_reward": 0.015625, "step": 1355, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 7.40234375, "epoch": 0.822067293119127, "grad_norm": 2.1625447273254395, "kl": 34.25, "learning_rate": 6.658506597342734e-06, "loss": 0.0344, "reward": 0.00314774620346725, "reward_std": 0.10792243108153343, "rewards/ndcg_rule_reward": -0.022242878563702106, "rewards/rule_reward": 0.025390625, "step": 1356, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8226735374355866, "grad_norm": 1.9592169523239136, "kl": 31.5, "learning_rate": 6.653876418381937e-06, "loss": 0.0315, "reward": 0.0027732020244002342, "reward_std": 0.0997069738805294, "rewards/ndcg_rule_reward": -0.020664297975599766, "rewards/rule_reward": 0.0234375, "step": 1357, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8232797817520461, "grad_norm": 2.6213366985321045, "kl": 29.5, "learning_rate": 6.649244646363767e-06, "loss": 0.0295, "reward": 0.004338360158726573, "reward_std": 0.12421747297048569, "rewards/ndcg_rule_reward": -0.024958514608442783, "rewards/rule_reward": 0.029296875, "step": 1358, "token_diversity": 0.3828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8238860260685056, "grad_norm": 1.9758424758911133, "kl": 33.0625, "learning_rate": 6.644611285749668e-06, "loss": 0.033, "reward": 0.002891880110837519, "reward_std": 0.09963171929121017, "rewards/ndcg_rule_reward": -0.020545619539916515, "rewards/rule_reward": 0.0234375, "step": 1359, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 7.40234375, "epoch": 0.8244922703849651, "grad_norm": 2.0900204181671143, "kl": 23.1875, "learning_rate": 6.639976341002614e-06, "loss": 0.0232, "reward": 0.0027008389588445425, "reward_std": 0.1081249825656414, "rewards/ndcg_rule_reward": -0.022689785808324814, "rewards/rule_reward": 0.025390625, "step": 1360, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8250985147014247, "grad_norm": 1.403976321220398, "kl": 28.125, "learning_rate": 6.635339816587109e-06, "loss": 0.0282, "reward": 0.0030863327556289732, "reward_std": 0.09958213195204735, "rewards/ndcg_rule_reward": -0.020351167302578688, "rewards/rule_reward": 0.0234375, "step": 1361, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8257047590178842, "grad_norm": 2.1811347007751465, "kl": 34.5625, "learning_rate": 6.630701716969172e-06, "loss": 0.0346, "reward": 0.004280326422303915, "reward_std": 0.15795515477657318, "rewards/ndcg_rule_reward": -0.03282904718071222, "rewards/rule_reward": 0.037109375, "step": 1362, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.8263110033343437, "grad_norm": 13.693873405456543, "kl": 39.28125, "learning_rate": 6.626062046616346e-06, "loss": 0.0391, "reward": 0.002603252127300948, "reward_std": 0.09132938832044601, "rewards/ndcg_rule_reward": -0.018881122581660748, "rewards/rule_reward": 0.021484375, "step": 1363, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8269172476508033, "grad_norm": 5.518312454223633, "kl": 21.875, "learning_rate": 6.621420809997683e-06, "loss": 0.0218, "reward": 0.0037957008462399244, "reward_std": 0.1160532794892788, "rewards/ndcg_rule_reward": -0.023548049852252007, "rewards/rule_reward": 0.02734375, "step": 1364, "token_diversity": 0.37109375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.8275234919672628, "grad_norm": 1.9511489868164062, "kl": 34.6875, "learning_rate": 6.616778011583744e-06, "loss": 0.0345, "reward": 0.003004172700457275, "reward_std": 0.09957711026072502, "rewards/ndcg_rule_reward": -0.020433328114449978, "rewards/rule_reward": 0.0234375, "step": 1365, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8281297362837223, "grad_norm": 1.2868928909301758, "kl": 52.5, "learning_rate": 6.612133655846593e-06, "loss": 0.0524, "reward": 0.005284639773890376, "reward_std": 0.15742244571447372, "rewards/ndcg_rule_reward": -0.03182473685592413, "rewards/rule_reward": 0.037109375, "step": 1366, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8287359806001818, "grad_norm": 1.6850792169570923, "kl": 28.3125, "learning_rate": 6.607487747259799e-06, "loss": 0.0283, "reward": 0.002905484987422824, "reward_std": 0.09120722487568855, "rewards/ndcg_rule_reward": -0.018578889779746532, "rewards/rule_reward": 0.021484375, "step": 1367, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8293422249166414, "grad_norm": 1.3157771825790405, "kl": 24.25, "learning_rate": 6.602840290298422e-06, "loss": 0.0242, "reward": 0.0029877606430090964, "reward_std": 0.09118080139160156, "rewards/ndcg_rule_reward": -0.018496614880859852, "rewards/rule_reward": 0.021484375, "step": 1368, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 7.642578125, "epoch": 0.829948469233101, "grad_norm": 1.7400245666503906, "kl": 37.75, "learning_rate": 6.598191289439016e-06, "loss": 0.0376, "reward": 0.0039687014650553465, "reward_std": 0.12436863780021667, "rewards/ndcg_rule_reward": -0.02532817330211401, "rewards/rule_reward": 0.029296875, "step": 1369, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8305547135495605, "grad_norm": 1.8928688764572144, "kl": 22.0, "learning_rate": 6.593540749159621e-06, "loss": 0.022, "reward": 0.0028960667550563812, "reward_std": 0.09967624768614769, "rewards/ndcg_rule_reward": -0.02054143324494362, "rewards/rule_reward": 0.0234375, "step": 1370, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.83116095786602, "grad_norm": 2.239997386932373, "kl": 22.625, "learning_rate": 6.588888673939758e-06, "loss": 0.0226, "reward": 0.0028826226480305195, "reward_std": 0.09124262630939484, "rewards/ndcg_rule_reward": -0.018601752817630768, "rewards/rule_reward": 0.021484375, "step": 1371, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8317672021824796, "grad_norm": 2.3070297241210938, "kl": 43.625, "learning_rate": 6.5842350682604314e-06, "loss": 0.0437, "reward": 0.004552797647193074, "reward_std": 0.14095215871930122, "rewards/ndcg_rule_reward": -0.028650326654314995, "rewards/rule_reward": 0.033203125, "step": 1372, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8323734464989391, "grad_norm": 3.788999080657959, "kl": 24.5625, "learning_rate": 6.579579936604117e-06, "loss": 0.0246, "reward": 0.0022766138426959515, "reward_std": 0.08310787007212639, "rewards/ndcg_rule_reward": -0.017254636622965336, "rewards/rule_reward": 0.01953125, "step": 1373, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8329796908153986, "grad_norm": 1.7935065031051636, "kl": 29.25, "learning_rate": 6.574923283454757e-06, "loss": 0.0292, "reward": 0.003636166686192155, "reward_std": 0.12453814968466759, "rewards/ndcg_rule_reward": -0.02566070854663849, "rewards/rule_reward": 0.029296875, "step": 1374, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 6.4921875, "epoch": 0.8335859351318582, "grad_norm": 1.4934850931167603, "kl": 30.75, "learning_rate": 6.570265113297765e-06, "loss": 0.0307, "reward": 0.003094348474405706, "reward_std": 0.09954501315951347, "rewards/ndcg_rule_reward": -0.020343152806162834, "rewards/rule_reward": 0.0234375, "step": 1375, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8341921794483177, "grad_norm": 2.5871119499206543, "kl": 20.3125, "learning_rate": 6.565605430620014e-06, "loss": 0.0203, "reward": 0.002789554768241942, "reward_std": 0.09969587996602058, "rewards/ndcg_rule_reward": -0.020647945813834667, "rewards/rule_reward": 0.0234375, "step": 1376, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8347984237647772, "grad_norm": 8.7473783493042, "kl": 38.0, "learning_rate": 6.56094423990983e-06, "loss": 0.038, "reward": 0.0014812950394116342, "reward_std": 0.0413944385945797, "rewards/ndcg_rule_reward": -0.00828433025162667, "rewards/rule_reward": 0.009765625, "step": 1377, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8354046680812367, "grad_norm": 2.1233537197113037, "kl": 28.1875, "learning_rate": 6.556281545656999e-06, "loss": 0.0281, "reward": 0.003465229761786759, "reward_std": 0.10779698565602303, "rewards/ndcg_rule_reward": -0.021925395354628563, "rewards/rule_reward": 0.025390625, "step": 1378, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8360109123976963, "grad_norm": 2.5812010765075684, "kl": 33.8125, "learning_rate": 6.551617352352748e-06, "loss": 0.0339, "reward": 0.0022158378269523382, "reward_std": 0.09151547029614449, "rewards/ndcg_rule_reward": -0.019268536940217018, "rewards/rule_reward": 0.021484375, "step": 1379, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 7.642578125, "epoch": 0.8366171567141558, "grad_norm": 1.756742238998413, "kl": 25.8125, "learning_rate": 6.546951664489751e-06, "loss": 0.0258, "reward": 0.0030586151406168938, "reward_std": 0.10795166343450546, "rewards/ndcg_rule_reward": -0.022332009859383106, "rewards/rule_reward": 0.025390625, "step": 1380, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8372234010306153, "grad_norm": 1.377677083015442, "kl": 17.6875, "learning_rate": 6.542284486562125e-06, "loss": 0.0177, "reward": 0.0024449509219266474, "reward_std": 0.0830137450248003, "rewards/ndcg_rule_reward": -0.017086300067603588, "rewards/rule_reward": 0.01953125, "step": 1381, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8378296453470748, "grad_norm": 3.436022996902466, "kl": 39.8125, "learning_rate": 6.537615823065413e-06, "loss": 0.0399, "reward": 0.0036382083781063557, "reward_std": 0.11613599956035614, "rewards/ndcg_rule_reward": -0.02370554208755493, "rewards/rule_reward": 0.02734375, "step": 1382, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8384358896635344, "grad_norm": 1.8879426717758179, "kl": 23.625, "learning_rate": 6.532945678496601e-06, "loss": 0.0236, "reward": 0.0032998884562402964, "reward_std": 0.12470120936632156, "rewards/ndcg_rule_reward": -0.025996986776590347, "rewards/rule_reward": 0.029296875, "step": 1383, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.8390421339799939, "grad_norm": 2.3022966384887695, "kl": 45.25, "learning_rate": 6.528274057354091e-06, "loss": 0.0452, "reward": 0.004464423283934593, "reward_std": 0.1578027456998825, "rewards/ndcg_rule_reward": -0.03264495171606541, "rewards/rule_reward": 0.037109375, "step": 1384, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8396483782964534, "grad_norm": 3.7072203159332275, "kl": 54.5, "learning_rate": 6.5236009641377125e-06, "loss": 0.0543, "reward": 0.0036563293542712927, "reward_std": 0.0992760881781578, "rewards/ndcg_rule_reward": -0.019781170412898064, "rewards/rule_reward": 0.0234375, "step": 1385, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 11.7265625, "epoch": 0.840254622612913, "grad_norm": 7.476747035980225, "kl": 35.9375, "learning_rate": 6.5189264033487155e-06, "loss": 0.036, "reward": 0.004178608534857631, "reward_std": 0.1326991245150566, "rewards/ndcg_rule_reward": -0.027071391232311726, "rewards/rule_reward": 0.03125, "step": 1386, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8408608669293726, "grad_norm": 1.3901448249816895, "kl": 20.5, "learning_rate": 6.514250379489754e-06, "loss": 0.0205, "reward": 0.003374819178134203, "reward_std": 0.1162492111325264, "rewards/ndcg_rule_reward": -0.023968931287527084, "rewards/rule_reward": 0.02734375, "step": 1387, "token_diversity": 0.37109375 }, { "categorical_diversity": 1.0, "completion_length": 5.013671875, "epoch": 0.8414671112458321, "grad_norm": 2.528491735458374, "kl": 33.875, "learning_rate": 6.5095728970649045e-06, "loss": 0.0339, "reward": 0.003727978328242898, "reward_std": 0.12450345978140831, "rewards/ndcg_rule_reward": -0.025568896904587746, "rewards/rule_reward": 0.029296875, "step": 1388, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8420733555622916, "grad_norm": 0.8005887269973755, "kl": 23.40625, "learning_rate": 6.504893960579634e-06, "loss": 0.0234, "reward": 0.002121034893207252, "reward_std": 0.05792530067265034, "rewards/ndcg_rule_reward": -0.011550840688869357, "rewards/rule_reward": 0.013671875, "step": 1389, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8426795998787512, "grad_norm": 2.4755444526672363, "kl": 22.125, "learning_rate": 6.500213574540823e-06, "loss": 0.0221, "reward": 0.002913689415436238, "reward_std": 0.10803530737757683, "rewards/ndcg_rule_reward": -0.02247693482786417, "rewards/rule_reward": 0.025390625, "step": 1390, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8432858441952107, "grad_norm": 1.4080342054367065, "kl": 48.75, "learning_rate": 6.4955317434567426e-06, "loss": 0.0488, "reward": 0.004772457177750766, "reward_std": 0.13241640105843544, "rewards/ndcg_rule_reward": -0.026477541774511337, "rewards/rule_reward": 0.03125, "step": 1391, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8438920885116702, "grad_norm": 2.8951187133789062, "kl": 26.4375, "learning_rate": 6.490848471837052e-06, "loss": 0.0264, "reward": 0.002976293908432126, "reward_std": 0.09961230680346489, "rewards/ndcg_rule_reward": -0.020461206324398518, "rewards/rule_reward": 0.0234375, "step": 1392, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8444983328281297, "grad_norm": 1.8384255170822144, "kl": 43.25, "learning_rate": 6.486163764192805e-06, "loss": 0.0432, "reward": 0.0034686281578615308, "reward_std": 0.11619378998875618, "rewards/ndcg_rule_reward": -0.023875122889876366, "rewards/rule_reward": 0.02734375, "step": 1393, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8451045771445893, "grad_norm": 2.133568286895752, "kl": 36.5, "learning_rate": 6.481477625036436e-06, "loss": 0.0365, "reward": 0.004461147589609027, "reward_std": 0.14097536355257034, "rewards/ndcg_rule_reward": -0.028741978108882904, "rewards/rule_reward": 0.033203125, "step": 1394, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8457108214610488, "grad_norm": 2.118403673171997, "kl": 30.5, "learning_rate": 6.476790058881754e-06, "loss": 0.0306, "reward": 0.002961405203677714, "reward_std": 0.11644138395786285, "rewards/ndcg_rule_reward": -0.024382345378398895, "rewards/rule_reward": 0.02734375, "step": 1395, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8463170657775083, "grad_norm": 3.582537889480591, "kl": 34.0, "learning_rate": 6.472101070243952e-06, "loss": 0.0339, "reward": 0.003416426945477724, "reward_std": 0.09938065335154533, "rewards/ndcg_rule_reward": -0.020021073520183563, "rewards/rule_reward": 0.0234375, "step": 1396, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8469233100939678, "grad_norm": 11.527144432067871, "kl": 74.25, "learning_rate": 6.467410663639583e-06, "loss": 0.0745, "reward": 0.0048659860622137785, "reward_std": 0.1660362333059311, "rewards/ndcg_rule_reward": -0.03419651370495558, "rewards/rule_reward": 0.0390625, "step": 1397, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8475295544104274, "grad_norm": 3.0570173263549805, "kl": 18.59375, "learning_rate": 6.462718843586572e-06, "loss": 0.0186, "reward": 0.0019393358961679041, "reward_std": 0.06644320860505104, "rewards/ndcg_rule_reward": -0.013685664162039757, "rewards/rule_reward": 0.015625, "step": 1398, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.8481357987268869, "grad_norm": 1.9024814367294312, "kl": 35.1875, "learning_rate": 6.458025614604203e-06, "loss": 0.0352, "reward": 0.0036341550294309855, "reward_std": 0.10771618410944939, "rewards/ndcg_rule_reward": -0.021756470203399658, "rewards/rule_reward": 0.025390625, "step": 1399, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8487420430433464, "grad_norm": 2.542799472808838, "kl": 35.25, "learning_rate": 6.453330981213119e-06, "loss": 0.0352, "reward": 0.0021438696421682835, "reward_std": 0.0663187988102436, "rewards/ndcg_rule_reward": -0.013481130357831717, "rewards/rule_reward": 0.015625, "step": 1400, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.849348287359806, "grad_norm": 1.4952696561813354, "kl": 17.9375, "learning_rate": 6.4486349479353135e-06, "loss": 0.018, "reward": 0.002538526605349034, "reward_std": 0.1082093995064497, "rewards/ndcg_rule_reward": -0.022852095775306225, "rewards/rule_reward": 0.025390625, "step": 1401, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8499545316762656, "grad_norm": 9.981120109558105, "kl": 117.75, "learning_rate": 6.44393751929413e-06, "loss": 0.1177, "reward": 0.0046856822445988655, "reward_std": 0.14090928807854652, "rewards/ndcg_rule_reward": -0.028517444618046284, "rewards/rule_reward": 0.033203125, "step": 1402, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.8505607759927251, "grad_norm": 1.9373053312301636, "kl": 20.875, "learning_rate": 6.439238699814256e-06, "loss": 0.0209, "reward": 0.004189390921965241, "reward_std": 0.14116238802671432, "rewards/ndcg_rule_reward": -0.029013734310865402, "rewards/rule_reward": 0.033203125, "step": 1403, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8511670203091846, "grad_norm": 2.6918466091156006, "kl": 30.9375, "learning_rate": 6.434538494021718e-06, "loss": 0.031, "reward": 0.003245356259867549, "reward_std": 0.11631917580962181, "rewards/ndcg_rule_reward": -0.024098394438624382, "rewards/rule_reward": 0.02734375, "step": 1404, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8517732646256442, "grad_norm": 2.999875783920288, "kl": 64.625, "learning_rate": 6.42983690644388e-06, "loss": 0.0645, "reward": 0.004112767404876649, "reward_std": 0.13274765759706497, "rewards/ndcg_rule_reward": -0.027137232944369316, "rewards/rule_reward": 0.03125, "step": 1405, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.8523795089421037, "grad_norm": 1.1924620866775513, "kl": 18.21875, "learning_rate": 6.425133941609432e-06, "loss": 0.0183, "reward": 0.002541977446526289, "reward_std": 0.09977908432483673, "rewards/ndcg_rule_reward": -0.020895523019135, "rewards/rule_reward": 0.0234375, "step": 1406, "token_diversity": 0.49832589285714285 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8529857532585632, "grad_norm": 1.4385658502578735, "kl": 29.3125, "learning_rate": 6.420429604048394e-06, "loss": 0.0294, "reward": 0.0041772781405597925, "reward_std": 0.12427082657814026, "rewards/ndcg_rule_reward": -0.02511959709227085, "rewards/rule_reward": 0.029296875, "step": 1407, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8535919975750227, "grad_norm": 1.2610677480697632, "kl": 31.6875, "learning_rate": 6.415723898292111e-06, "loss": 0.0317, "reward": 0.0035604494623839855, "reward_std": 0.09933620691299438, "rewards/ndcg_rule_reward": -0.019877051003277302, "rewards/rule_reward": 0.0234375, "step": 1408, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 6.935546875, "epoch": 0.8541982418914823, "grad_norm": 1.482980728149414, "kl": 29.5, "learning_rate": 6.411016828873239e-06, "loss": 0.0295, "reward": 0.003681726288050413, "reward_std": 0.11608905345201492, "rewards/ndcg_rule_reward": -0.023662024177610874, "rewards/rule_reward": 0.02734375, "step": 1409, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.8548044862079418, "grad_norm": 1.24285888671875, "kl": 17.4375, "learning_rate": 6.406308400325754e-06, "loss": 0.0174, "reward": 0.002285732072778046, "reward_std": 0.07466440461575985, "rewards/ndcg_rule_reward": -0.015292393509298563, "rewards/rule_reward": 0.017578125, "step": 1410, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.8554107305244013, "grad_norm": 1.5302777290344238, "kl": 40.25, "learning_rate": 6.401598617184939e-06, "loss": 0.0403, "reward": 0.004986130399629474, "reward_std": 0.1491447314620018, "rewards/ndcg_rule_reward": -0.030170119367539883, "rewards/rule_reward": 0.03515625, "step": 1411, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8560169748408608, "grad_norm": 1.7518460750579834, "kl": 21.3125, "learning_rate": 6.39688748398738e-06, "loss": 0.0213, "reward": 0.0035242571029812098, "reward_std": 0.10778028145432472, "rewards/ndcg_rule_reward": -0.02186636859551072, "rewards/rule_reward": 0.025390625, "step": 1412, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8566232191573204, "grad_norm": 1.4717367887496948, "kl": 22.59375, "learning_rate": 6.392175005270964e-06, "loss": 0.0227, "reward": 0.002625826164148748, "reward_std": 0.08292589336633682, "rewards/ndcg_rule_reward": -0.016905424185097218, "rewards/rule_reward": 0.01953125, "step": 1413, "token_diversity": 0.51953125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8572294634737799, "grad_norm": 79.5566177368164, "kl": 203.125, "learning_rate": 6.387461185574875e-06, "loss": 0.203, "reward": 0.0036915555829182267, "reward_std": 0.10768803209066391, "rewards/ndcg_rule_reward": -0.021699069999158382, "rewards/rule_reward": 0.025390625, "step": 1414, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8578357077902394, "grad_norm": 1.9642120599746704, "kl": 54.375, "learning_rate": 6.3827460294395905e-06, "loss": 0.0543, "reward": 0.004072357900440693, "reward_std": 0.1327262818813324, "rewards/ndcg_rule_reward": -0.027177641168236732, "rewards/rule_reward": 0.03125, "step": 1415, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 10.765625, "epoch": 0.858441952106699, "grad_norm": 640.2420654296875, "kl": 956.5, "learning_rate": 6.3780295414068715e-06, "loss": 0.9604, "reward": 0.004724275786429644, "reward_std": 0.14926182478666306, "rewards/ndcg_rule_reward": -0.030431974679231644, "rewards/rule_reward": 0.03515625, "step": 1416, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 8.36328125, "epoch": 0.8590481964231585, "grad_norm": 2.5956432819366455, "kl": 63.75, "learning_rate": 6.373311726019763e-06, "loss": 0.0638, "reward": 0.004119647666811943, "reward_std": 0.1243254765868187, "rewards/ndcg_rule_reward": -0.025177226401865482, "rewards/rule_reward": 0.029296875, "step": 1417, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.859654440739618, "grad_norm": 4.112171173095703, "kl": 67.625, "learning_rate": 6.368592587822591e-06, "loss": 0.0675, "reward": 0.004576551727950573, "reward_std": 0.140936940908432, "rewards/ndcg_rule_reward": -0.028626574203372, "rewards/rule_reward": 0.033203125, "step": 1418, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8602606850560776, "grad_norm": 14.976384162902832, "kl": 65.0, "learning_rate": 6.363872131360952e-06, "loss": 0.065, "reward": 0.0023768499959260225, "reward_std": 0.07464729622006416, "rewards/ndcg_rule_reward": -0.015201276168227196, "rewards/rule_reward": 0.017578125, "step": 1419, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8608669293725372, "grad_norm": 1.2839325666427612, "kl": 38.25, "learning_rate": 6.3591503611817155e-06, "loss": 0.0383, "reward": 0.0024054974783211946, "reward_std": 0.0745968297123909, "rewards/ndcg_rule_reward": -0.015172627288848162, "rewards/rule_reward": 0.017578125, "step": 1420, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 8.36328125, "epoch": 0.8614731736889967, "grad_norm": 15.133426666259766, "kl": 68.8125, "learning_rate": 6.354427281833014e-06, "loss": 0.0687, "reward": 0.0034202206879854202, "reward_std": 0.09941066056489944, "rewards/ndcg_rule_reward": -0.02001727931201458, "rewards/rule_reward": 0.0234375, "step": 1421, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8620794180054562, "grad_norm": 1.8297791481018066, "kl": 23.15625, "learning_rate": 6.349702897864243e-06, "loss": 0.0232, "reward": 0.00290447601582855, "reward_std": 0.0996423214673996, "rewards/ndcg_rule_reward": -0.020533024333417416, "rewards/rule_reward": 0.0234375, "step": 1422, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 6.921875, "epoch": 0.8626856623219157, "grad_norm": 74.28675842285156, "kl": 90.5, "learning_rate": 6.344977213826053e-06, "loss": 0.0905, "reward": 0.002975119510665536, "reward_std": 0.08275575190782547, "rewards/ndcg_rule_reward": -0.016556130722165108, "rewards/rule_reward": 0.01953125, "step": 1423, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8632919066383753, "grad_norm": 1.5936349630355835, "kl": 30.84375, "learning_rate": 6.340250234270349e-06, "loss": 0.0309, "reward": 0.0027490181382745504, "reward_std": 0.09972258284687996, "rewards/ndcg_rule_reward": -0.020688481628894806, "rewards/rule_reward": 0.0234375, "step": 1424, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.8638981509548348, "grad_norm": 10.486730575561523, "kl": 37.125, "learning_rate": 6.33552196375028e-06, "loss": 0.0371, "reward": 0.0033112330129370093, "reward_std": 0.11627194285392761, "rewards/ndcg_rule_reward": -0.02403251640498638, "rewards/rule_reward": 0.02734375, "step": 1425, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8645043952712943, "grad_norm": 1.3210636377334595, "kl": 17.09375, "learning_rate": 6.330792406820242e-06, "loss": 0.0171, "reward": 0.0026174141094088554, "reward_std": 0.09975563734769821, "rewards/ndcg_rule_reward": -0.020820085890591145, "rewards/rule_reward": 0.0234375, "step": 1426, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8651106395877539, "grad_norm": 1.3148659467697144, "kl": 24.46875, "learning_rate": 6.326061568035868e-06, "loss": 0.0244, "reward": 0.0030331070302054286, "reward_std": 0.10795659571886063, "rewards/ndcg_rule_reward": -0.0223575197160244, "rewards/rule_reward": 0.025390625, "step": 1427, "token_diversity": 0.53515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8657168839042134, "grad_norm": 2.954150915145874, "kl": 15.875, "learning_rate": 6.32132945195403e-06, "loss": 0.0159, "reward": 0.004024462774395943, "reward_std": 0.14120623469352722, "rewards/ndcg_rule_reward": -0.029178661294281483, "rewards/rule_reward": 0.033203125, "step": 1428, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.8663231282206729, "grad_norm": 1.492448329925537, "kl": 17.28125, "learning_rate": 6.316596063132823e-06, "loss": 0.0173, "reward": 0.003546367515809834, "reward_std": 0.11616566777229309, "rewards/ndcg_rule_reward": -0.023797382600605488, "rewards/rule_reward": 0.02734375, "step": 1429, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8669293725371324, "grad_norm": 1.8542377948760986, "kl": 26.625, "learning_rate": 6.311861406131574e-06, "loss": 0.0266, "reward": 0.003470945404842496, "reward_std": 0.10779570043087006, "rewards/ndcg_rule_reward": -0.021919679827988148, "rewards/rule_reward": 0.025390625, "step": 1430, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.867535616853592, "grad_norm": 2.3239073753356934, "kl": 7.84375, "learning_rate": 6.307125485510829e-06, "loss": 0.0078, "reward": 0.0020448610302992165, "reward_std": 0.06640905141830444, "rewards/ndcg_rule_reward": -0.013580139726400375, "rewards/rule_reward": 0.015625, "step": 1431, "token_diversity": 0.5234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8681418611700515, "grad_norm": 3.2659826278686523, "kl": 39.75, "learning_rate": 6.302388305832351e-06, "loss": 0.0398, "reward": 0.0035352587001398206, "reward_std": 0.11617226898670197, "rewards/ndcg_rule_reward": -0.0238084914162755, "rewards/rule_reward": 0.02734375, "step": 1432, "token_diversity": 0.45703125 }, { "categorical_diversity": 0.875, "completion_length": 7.8828125, "epoch": 0.868748105486511, "grad_norm": 1.4375064373016357, "kl": 29.75, "learning_rate": 6.297649871659118e-06, "loss": 0.0298, "reward": 0.0037627120036631823, "reward_std": 0.12446188181638718, "rewards/ndcg_rule_reward": -0.025534164160490036, "rewards/rule_reward": 0.029296875, "step": 1433, "token_diversity": 0.31826268564356436 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8693543498029705, "grad_norm": 1.0458139181137085, "kl": 30.75, "learning_rate": 6.2929101875553115e-06, "loss": 0.0308, "reward": 0.0028697061352431774, "reward_std": 0.09124808385968208, "rewards/ndcg_rule_reward": -0.018614668399095535, "rewards/rule_reward": 0.021484375, "step": 1434, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8699605941194302, "grad_norm": 1.26529860496521, "kl": 42.875, "learning_rate": 6.288169258086323e-06, "loss": 0.0429, "reward": 0.004060212755575776, "reward_std": 0.11591079086065292, "rewards/ndcg_rule_reward": -0.023283536545932293, "rewards/rule_reward": 0.02734375, "step": 1435, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 12.6875, "epoch": 0.8705668384358897, "grad_norm": 2.0514309406280518, "kl": 46.625, "learning_rate": 6.283427087818742e-06, "loss": 0.0466, "reward": 0.00472576473839581, "reward_std": 0.14086536318063736, "rewards/ndcg_rule_reward": -0.028477360494434834, "rewards/rule_reward": 0.033203125, "step": 1436, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.8711730827523492, "grad_norm": 1.7426625490188599, "kl": 30.0625, "learning_rate": 6.278683681320348e-06, "loss": 0.0301, "reward": 0.0028713487554341555, "reward_std": 0.12490051984786987, "rewards/ndcg_rule_reward": -0.026425526477396488, "rewards/rule_reward": 0.029296875, "step": 1437, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8717793270688088, "grad_norm": 1.4254225492477417, "kl": 29.375, "learning_rate": 6.273939043160118e-06, "loss": 0.0294, "reward": 0.002521936781704426, "reward_std": 0.08299027010798454, "rewards/ndcg_rule_reward": -0.01700931414961815, "rewards/rule_reward": 0.01953125, "step": 1438, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8723855713852683, "grad_norm": 1.809136986732483, "kl": 34.0625, "learning_rate": 6.26919317790821e-06, "loss": 0.0341, "reward": 0.002998779062181711, "reward_std": 0.11639884859323502, "rewards/ndcg_rule_reward": -0.024344971403479576, "rewards/rule_reward": 0.02734375, "step": 1439, "token_diversity": 0.359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8729918157017278, "grad_norm": 1.4434196949005127, "kl": 53.4375, "learning_rate": 6.2644460901359715e-06, "loss": 0.0535, "reward": 0.00339640025049448, "reward_std": 0.09941716492176056, "rewards/ndcg_rule_reward": -0.020041100680828094, "rewards/rule_reward": 0.0234375, "step": 1440, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8735980600181873, "grad_norm": 1.9339650869369507, "kl": 22.859375, "learning_rate": 6.259697784415919e-06, "loss": 0.0228, "reward": 0.00344264751765877, "reward_std": 0.11619649454951286, "rewards/ndcg_rule_reward": -0.023901102133095264, "rewards/rule_reward": 0.02734375, "step": 1441, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8742043043346469, "grad_norm": 1.3957140445709229, "kl": 16.4375, "learning_rate": 6.254948265321744e-06, "loss": 0.0164, "reward": 0.002098019525874406, "reward_std": 0.08315630629658699, "rewards/ndcg_rule_reward": -0.017433229833841324, "rewards/rule_reward": 0.01953125, "step": 1442, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8748105486511064, "grad_norm": 1.809429407119751, "kl": 35.5, "learning_rate": 6.250197537428312e-06, "loss": 0.0354, "reward": 0.0033341359812766314, "reward_std": 0.12466436251997948, "rewards/ndcg_rule_reward": -0.025962737388908863, "rewards/rule_reward": 0.029296875, "step": 1443, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8754167929675659, "grad_norm": 3.3047852516174316, "kl": 35.5, "learning_rate": 6.2454456053116485e-06, "loss": 0.0355, "reward": 0.00306318374350667, "reward_std": 0.10798216983675957, "rewards/ndcg_rule_reward": -0.022327440790832043, "rewards/rule_reward": 0.025390625, "step": 1444, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8760230372840254, "grad_norm": 2.616926670074463, "kl": 31.375, "learning_rate": 6.2406924735489395e-06, "loss": 0.0314, "reward": 0.002987323794513941, "reward_std": 0.1080293133854866, "rewards/ndcg_rule_reward": -0.022403301671147346, "rewards/rule_reward": 0.025390625, "step": 1445, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 6.201171875, "epoch": 0.876629281600485, "grad_norm": 1.3915938138961792, "kl": 44.875, "learning_rate": 6.235938146718526e-06, "loss": 0.0448, "reward": 0.004710863810032606, "reward_std": 0.14929407835006714, "rewards/ndcg_rule_reward": -0.03044538665562868, "rewards/rule_reward": 0.03515625, "step": 1446, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8772355259169445, "grad_norm": 6.084716796875, "kl": 53.875, "learning_rate": 6.231182629399901e-06, "loss": 0.0538, "reward": 0.0032888471614569426, "reward_std": 0.10787948220968246, "rewards/ndcg_rule_reward": -0.022101777605712414, "rewards/rule_reward": 0.025390625, "step": 1447, "token_diversity": 0.3671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.877841770233404, "grad_norm": 2.628121852874756, "kl": 56.125, "learning_rate": 6.226425926173708e-06, "loss": 0.0562, "reward": 0.00415627658367157, "reward_std": 0.1327347718179226, "rewards/ndcg_rule_reward": -0.027093722485005856, "rewards/rule_reward": 0.03125, "step": 1448, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8784480145498635, "grad_norm": 1.1900577545166016, "kl": 18.75, "learning_rate": 6.2216680416217235e-06, "loss": 0.0188, "reward": 0.0024857890093699098, "reward_std": 0.09139642119407654, "rewards/ndcg_rule_reward": -0.018998586107045412, "rewards/rule_reward": 0.021484375, "step": 1449, "token_diversity": 0.37109375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8790542588663232, "grad_norm": 1.1044883728027344, "kl": 27.0, "learning_rate": 6.21690898032687e-06, "loss": 0.0269, "reward": 0.002715689013712108, "reward_std": 0.09130002558231354, "rewards/ndcg_rule_reward": -0.01876868586987257, "rewards/rule_reward": 0.021484375, "step": 1450, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8796605031827827, "grad_norm": 1.857601284980774, "kl": 18.4375, "learning_rate": 6.212148746873201e-06, "loss": 0.0184, "reward": 0.0034256847575306892, "reward_std": 0.12463561072945595, "rewards/ndcg_rule_reward": -0.025871189311146736, "rewards/rule_reward": 0.029296875, "step": 1451, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.8802667474992422, "grad_norm": 1.5304629802703857, "kl": 20.5, "learning_rate": 6.207387345845899e-06, "loss": 0.0205, "reward": 0.0036543395835906267, "reward_std": 0.1413608305156231, "rewards/ndcg_rule_reward": -0.02954878658056259, "rewards/rule_reward": 0.033203125, "step": 1452, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 7.642578125, "epoch": 0.8808729918157018, "grad_norm": 1.5112932920455933, "kl": 14.78125, "learning_rate": 6.202624781831269e-06, "loss": 0.0148, "reward": 0.0026865064282901585, "reward_std": 0.09976330772042274, "rewards/ndcg_rule_reward": -0.020750994328409433, "rewards/rule_reward": 0.0234375, "step": 1453, "token_diversity": 0.37890625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8814792361321613, "grad_norm": 3.0893218517303467, "kl": 38.5, "learning_rate": 6.19786105941674e-06, "loss": 0.0385, "reward": 0.0049726509023457766, "reward_std": 0.16597406938672066, "rewards/ndcg_rule_reward": -0.03408984839916229, "rewards/rule_reward": 0.0390625, "step": 1454, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8820854804486208, "grad_norm": 12.563277244567871, "kl": 77.25, "learning_rate": 6.193096183190855e-06, "loss": 0.0773, "reward": 0.0028556305915117264, "reward_std": 0.09122199565172195, "rewards/ndcg_rule_reward": -0.018628746271133423, "rewards/rule_reward": 0.021484375, "step": 1455, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8826917247650803, "grad_norm": 1.2749518156051636, "kl": 27.5, "learning_rate": 6.188330157743268e-06, "loss": 0.0275, "reward": 0.0028204622212797403, "reward_std": 0.08284012228250504, "rewards/ndcg_rule_reward": -0.016710788011550903, "rewards/rule_reward": 0.01953125, "step": 1456, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8832979690815399, "grad_norm": 2.0880329608917236, "kl": 39.0, "learning_rate": 6.1835629876647405e-06, "loss": 0.0389, "reward": 0.004366026725620031, "reward_std": 0.14100520685315132, "rewards/ndcg_rule_reward": -0.02883709780871868, "rewards/rule_reward": 0.033203125, "step": 1457, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8839042133979994, "grad_norm": 0.9738495945930481, "kl": 31.1875, "learning_rate": 6.178794677547138e-06, "loss": 0.0312, "reward": 0.0029927690629847348, "reward_std": 0.08275172859430313, "rewards/ndcg_rule_reward": -0.01653848122805357, "rewards/rule_reward": 0.01953125, "step": 1458, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.8845104577144589, "grad_norm": 38.319923400878906, "kl": 85.5, "learning_rate": 6.174025231983417e-06, "loss": 0.0855, "reward": 0.004604072775691748, "reward_std": 0.1325017735362053, "rewards/ndcg_rule_reward": -0.026645926758646965, "rewards/rule_reward": 0.03125, "step": 1459, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8851167020309184, "grad_norm": 2.1362931728363037, "kl": 46.375, "learning_rate": 6.1692546555676416e-06, "loss": 0.0463, "reward": 0.004015225567854941, "reward_std": 0.12435073032975197, "rewards/ndcg_rule_reward": -0.025281650014221668, "rewards/rule_reward": 0.029296875, "step": 1460, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.885722946347378, "grad_norm": 1.6272789239883423, "kl": 23.28125, "learning_rate": 6.1644829528949505e-06, "loss": 0.0232, "reward": 0.0022931781131774187, "reward_std": 0.08309229835867882, "rewards/ndcg_rule_reward": -0.017238072585314512, "rewards/rule_reward": 0.01953125, "step": 1461, "token_diversity": 0.5390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8863291906638375, "grad_norm": 2.4293222427368164, "kl": 44.375, "learning_rate": 6.159710128561574e-06, "loss": 0.0444, "reward": 0.0031091950368136168, "reward_std": 0.09111380204558372, "rewards/ndcg_rule_reward": -0.018375180661678314, "rewards/rule_reward": 0.021484375, "step": 1462, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.886935434980297, "grad_norm": 1.81821608543396, "kl": 40.125, "learning_rate": 6.154936187164825e-06, "loss": 0.0402, "reward": 0.0020626678597182035, "reward_std": 0.06637654453516006, "rewards/ndcg_rule_reward": -0.01356233237311244, "rewards/rule_reward": 0.015625, "step": 1463, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8875416792967566, "grad_norm": 1.254419207572937, "kl": 41.75, "learning_rate": 6.150161133303088e-06, "loss": 0.0417, "reward": 0.0036025827284902334, "reward_std": 0.09928901493549347, "rewards/ndcg_rule_reward": -0.019834917970001698, "rewards/rule_reward": 0.0234375, "step": 1464, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.8881479236132162, "grad_norm": 1.5824193954467773, "kl": 66.75, "learning_rate": 6.145384971575823e-06, "loss": 0.0667, "reward": 0.004101103870198131, "reward_std": 0.12432164698839188, "rewards/ndcg_rule_reward": -0.025195770896971226, "rewards/rule_reward": 0.029296875, "step": 1465, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8887541679296757, "grad_norm": 3.465888023376465, "kl": 49.375, "learning_rate": 6.140607706583552e-06, "loss": 0.0494, "reward": 0.0036842033732682467, "reward_std": 0.11612233519554138, "rewards/ndcg_rule_reward": -0.023659546859562397, "rewards/rule_reward": 0.02734375, "step": 1466, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8893604122461352, "grad_norm": 1.6854628324508667, "kl": 46.25, "learning_rate": 6.1358293429278625e-06, "loss": 0.0461, "reward": 0.003691017278470099, "reward_std": 0.12448592856526375, "rewards/ndcg_rule_reward": -0.025605857372283936, "rewards/rule_reward": 0.029296875, "step": 1467, "token_diversity": 0.359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8899666565625948, "grad_norm": 1.3920819759368896, "kl": 30.625, "learning_rate": 6.131049885211404e-06, "loss": 0.0305, "reward": 0.0024731401936151087, "reward_std": 0.07458123937249184, "rewards/ndcg_rule_reward": -0.015104985795915127, "rewards/rule_reward": 0.017578125, "step": 1468, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8905729008790543, "grad_norm": 1.783219575881958, "kl": 18.765625, "learning_rate": 6.126269338037871e-06, "loss": 0.0188, "reward": 0.0029446150292642415, "reward_std": 0.09962743148207664, "rewards/ndcg_rule_reward": -0.020492885261774063, "rewards/rule_reward": 0.0234375, "step": 1469, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8911791451955138, "grad_norm": 2.7682204246520996, "kl": 44.0, "learning_rate": 6.121487706012018e-06, "loss": 0.0441, "reward": 0.00446285423822701, "reward_std": 0.1410035416483879, "rewards/ndcg_rule_reward": -0.02874027006328106, "rewards/rule_reward": 0.033203125, "step": 1470, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8917853895119733, "grad_norm": 1.2176085710525513, "kl": 26.25, "learning_rate": 6.116704993739635e-06, "loss": 0.0262, "reward": 0.002843121299520135, "reward_std": 0.09965374320745468, "rewards/ndcg_rule_reward": -0.020594379864633083, "rewards/rule_reward": 0.0234375, "step": 1471, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.8923916338284329, "grad_norm": 4.446800231933594, "kl": 62.25, "learning_rate": 6.111921205827559e-06, "loss": 0.0622, "reward": 0.004997107433155179, "reward_std": 0.14914260804653168, "rewards/ndcg_rule_reward": -0.030159142799675465, "rewards/rule_reward": 0.03515625, "step": 1472, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8929978781448924, "grad_norm": 2.4457080364227295, "kl": 38.0, "learning_rate": 6.1071363468836585e-06, "loss": 0.038, "reward": 0.0034007092472165823, "reward_std": 0.11620977520942688, "rewards/ndcg_rule_reward": -0.023943042382597923, "rewards/rule_reward": 0.02734375, "step": 1473, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8936041224613519, "grad_norm": 1.9137831926345825, "kl": 28.375, "learning_rate": 6.102350421516837e-06, "loss": 0.0284, "reward": 0.002747455728240311, "reward_std": 0.08285770192742348, "rewards/ndcg_rule_reward": -0.01678379438817501, "rewards/rule_reward": 0.01953125, "step": 1474, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8942103667778114, "grad_norm": 1.967023491859436, "kl": 44.9375, "learning_rate": 6.097563434337026e-06, "loss": 0.045, "reward": 0.003059133072383702, "reward_std": 0.09112502261996269, "rewards/ndcg_rule_reward": -0.01842524204403162, "rewards/rule_reward": 0.021484375, "step": 1475, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.894816611094271, "grad_norm": 4.025050640106201, "kl": 52.0, "learning_rate": 6.092775389955172e-06, "loss": 0.0521, "reward": 0.002850000048056245, "reward_std": 0.0828244537115097, "rewards/ndcg_rule_reward": -0.016681250650435686, "rewards/rule_reward": 0.01953125, "step": 1476, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8954228554107305, "grad_norm": 1.5565567016601562, "kl": 32.375, "learning_rate": 6.087986292983253e-06, "loss": 0.0324, "reward": 0.0030360075179487467, "reward_std": 0.0995664969086647, "rewards/ndcg_rule_reward": -0.02040149364620447, "rewards/rule_reward": 0.0234375, "step": 1477, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.89602909972719, "grad_norm": 1.412407398223877, "kl": 17.875, "learning_rate": 6.083196148034247e-06, "loss": 0.0179, "reward": 0.003238111035898328, "reward_std": 0.09950157254934311, "rewards/ndcg_rule_reward": -0.02019938826560974, "rewards/rule_reward": 0.0234375, "step": 1478, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8966353440436496, "grad_norm": 4.708575248718262, "kl": 26.75, "learning_rate": 6.078404959722153e-06, "loss": 0.0268, "reward": 0.003783205058425665, "reward_std": 0.12448854744434357, "rewards/ndcg_rule_reward": -0.025513670407235622, "rewards/rule_reward": 0.029296875, "step": 1479, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8972415883601091, "grad_norm": 1.7828466892242432, "kl": 20.25, "learning_rate": 6.073612732661966e-06, "loss": 0.0203, "reward": 0.003390433150343597, "reward_std": 0.10781734064221382, "rewards/ndcg_rule_reward": -0.022000192664563656, "rewards/rule_reward": 0.025390625, "step": 1480, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8978478326765686, "grad_norm": 1.7423393726348877, "kl": 53.5, "learning_rate": 6.068819471469687e-06, "loss": 0.0534, "reward": 0.004737786832265556, "reward_std": 0.14085883274674416, "rewards/ndcg_rule_reward": -0.028465338982641697, "rewards/rule_reward": 0.033203125, "step": 1481, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8984540769930282, "grad_norm": 1.8450158834457397, "kl": 17.375, "learning_rate": 6.064025180762313e-06, "loss": 0.0173, "reward": 0.0023899012012407184, "reward_std": 0.08306640759110451, "rewards/ndcg_rule_reward": -0.01714134868234396, "rewards/rule_reward": 0.01953125, "step": 1482, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.8990603213094878, "grad_norm": 1.6140202283859253, "kl": 43.6875, "learning_rate": 6.05922986515783e-06, "loss": 0.0437, "reward": 0.0036640206235460937, "reward_std": 0.12452198937535286, "rewards/ndcg_rule_reward": -0.02563285455107689, "rewards/rule_reward": 0.029296875, "step": 1483, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.8996665656259473, "grad_norm": 1.7047760486602783, "kl": 30.1875, "learning_rate": 6.05443352927521e-06, "loss": 0.0302, "reward": 0.003619340481236577, "reward_std": 0.1161121241748333, "rewards/ndcg_rule_reward": -0.023724409751594067, "rewards/rule_reward": 0.02734375, "step": 1484, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9002728099424068, "grad_norm": 2.0296123027801514, "kl": 30.375, "learning_rate": 6.049636177734415e-06, "loss": 0.0304, "reward": 0.002543162554502487, "reward_std": 0.08295504748821259, "rewards/ndcg_rule_reward": -0.0169880879111588, "rewards/rule_reward": 0.01953125, "step": 1485, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9008790542588663, "grad_norm": 1.8985209465026855, "kl": 60.375, "learning_rate": 6.044837815156377e-06, "loss": 0.0604, "reward": 0.004999538650736213, "reward_std": 0.14914871752262115, "rewards/ndcg_rule_reward": -0.030156712047755718, "rewards/rule_reward": 0.03515625, "step": 1486, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9014852985753259, "grad_norm": 1.5136524438858032, "kl": 34.71875, "learning_rate": 6.040038446163006e-06, "loss": 0.0347, "reward": 0.0033354932675138116, "reward_std": 0.10786667838692665, "rewards/ndcg_rule_reward": -0.022055131383240223, "rewards/rule_reward": 0.025390625, "step": 1487, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 7.40234375, "epoch": 0.9020915428917854, "grad_norm": 2.511434316635132, "kl": 30.84375, "learning_rate": 6.035238075377181e-06, "loss": 0.0308, "reward": 0.004078929661773145, "reward_std": 0.1411946825683117, "rewards/ndcg_rule_reward": -0.029124194756150246, "rewards/rule_reward": 0.033203125, "step": 1488, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9026977872082449, "grad_norm": 1.482704520225525, "kl": 34.125, "learning_rate": 6.030436707422746e-06, "loss": 0.0341, "reward": 0.0031982227228581905, "reward_std": 0.10791053622961044, "rewards/ndcg_rule_reward": -0.02219240367412567, "rewards/rule_reward": 0.025390625, "step": 1489, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 6.681640625, "epoch": 0.9033040315247045, "grad_norm": 4.042055606842041, "kl": 19.125, "learning_rate": 6.0256343469245045e-06, "loss": 0.0191, "reward": 0.0038966084830462933, "reward_std": 0.11602011322975159, "rewards/ndcg_rule_reward": -0.023447141982614994, "rewards/rule_reward": 0.02734375, "step": 1490, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.903910275841164, "grad_norm": 2.3772921562194824, "kl": 31.8125, "learning_rate": 6.020830998508218e-06, "loss": 0.0318, "reward": 0.004407115513458848, "reward_std": 0.14101941138505936, "rewards/ndcg_rule_reward": -0.028796009719371796, "rewards/rule_reward": 0.033203125, "step": 1491, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9045165201576235, "grad_norm": 2.346813440322876, "kl": 13.21875, "learning_rate": 6.016026666800597e-06, "loss": 0.0132, "reward": 0.002623541746288538, "reward_std": 0.09136264026165009, "rewards/ndcg_rule_reward": -0.01886083371937275, "rewards/rule_reward": 0.021484375, "step": 1492, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.905122764474083, "grad_norm": 0.8822610378265381, "kl": 16.9375, "learning_rate": 6.011221356429302e-06, "loss": 0.017, "reward": 0.0017230919911526144, "reward_std": 0.04968747869133949, "rewards/ndcg_rule_reward": -0.009995657950639725, "rewards/rule_reward": 0.01171875, "step": 1493, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9057290087905426, "grad_norm": 1.3180911540985107, "kl": 11.46875, "learning_rate": 6.006415072022933e-06, "loss": 0.0115, "reward": 0.0019104406237602234, "reward_std": 0.06642219051718712, "rewards/ndcg_rule_reward": -0.01371455891057849, "rewards/rule_reward": 0.015625, "step": 1494, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.9063352531070021, "grad_norm": 1.5286132097244263, "kl": 37.0625, "learning_rate": 6.001607818211032e-06, "loss": 0.0371, "reward": 0.0033472025534138083, "reward_std": 0.10782617330551147, "rewards/ndcg_rule_reward": -0.022043422795832157, "rewards/rule_reward": 0.025390625, "step": 1495, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9069414974234616, "grad_norm": 1.5856887102127075, "kl": 21.75, "learning_rate": 5.996799599624069e-06, "loss": 0.0218, "reward": 0.0032137876842170954, "reward_std": 0.09947798401117325, "rewards/ndcg_rule_reward": -0.02022371254861355, "rewards/rule_reward": 0.0234375, "step": 1496, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9075477417399211, "grad_norm": 2.869349479675293, "kl": 17.625, "learning_rate": 5.9919904208934495e-06, "loss": 0.0177, "reward": 0.0021933665266260505, "reward_std": 0.08312897011637688, "rewards/ndcg_rule_reward": -0.01733788475394249, "rewards/rule_reward": 0.01953125, "step": 1497, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9081539860563808, "grad_norm": 1.5789997577667236, "kl": 28.5625, "learning_rate": 5.987180286651503e-06, "loss": 0.0286, "reward": 0.0033776520285755396, "reward_std": 0.10785079374909401, "rewards/ndcg_rule_reward": -0.022012973204255104, "rewards/rule_reward": 0.025390625, "step": 1498, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9087602303728403, "grad_norm": 1.5981597900390625, "kl": 50.25, "learning_rate": 5.9823692015314734e-06, "loss": 0.0503, "reward": 0.004253222141414881, "reward_std": 0.1242465041577816, "rewards/ndcg_rule_reward": -0.025043653324246407, "rewards/rule_reward": 0.029296875, "step": 1499, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9093664746892998, "grad_norm": 1.1062108278274536, "kl": 63.625, "learning_rate": 5.977557170167528e-06, "loss": 0.0636, "reward": 0.003922887844964862, "reward_std": 0.11598987504839897, "rewards/ndcg_rule_reward": -0.023420861922204494, "rewards/rule_reward": 0.02734375, "step": 1500, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9099727190057593, "grad_norm": 2.6885881423950195, "kl": 52.5, "learning_rate": 5.972744197194739e-06, "loss": 0.0525, "reward": 0.0035600285045802593, "reward_std": 0.11616306751966476, "rewards/ndcg_rule_reward": -0.023783721961081028, "rewards/rule_reward": 0.02734375, "step": 1501, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9105789633222189, "grad_norm": 1.851877212524414, "kl": 32.625, "learning_rate": 5.967930287249094e-06, "loss": 0.0327, "reward": 0.0038521781098097563, "reward_std": 0.12444191798567772, "rewards/ndcg_rule_reward": -0.025444697588682175, "rewards/rule_reward": 0.029296875, "step": 1502, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9111852076386784, "grad_norm": 2.2110090255737305, "kl": 47.5, "learning_rate": 5.9631154449674745e-06, "loss": 0.0475, "reward": 0.002860700828023255, "reward_std": 0.09121905639767647, "rewards/ndcg_rule_reward": -0.01862367382273078, "rewards/rule_reward": 0.021484375, "step": 1503, "token_diversity": 0.546875 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.9117914519551379, "grad_norm": 1.2832937240600586, "kl": 37.125, "learning_rate": 5.958299674987663e-06, "loss": 0.0371, "reward": 0.003456087550148368, "reward_std": 0.10780112072825432, "rewards/ndcg_rule_reward": -0.021934537682682276, "rewards/rule_reward": 0.025390625, "step": 1504, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9123976962715975, "grad_norm": 1.2693334817886353, "kl": 57.5, "learning_rate": 5.953482981948338e-06, "loss": 0.0576, "reward": 0.004889814183115959, "reward_std": 0.14078441262245178, "rewards/ndcg_rule_reward": -0.02831331081688404, "rewards/rule_reward": 0.033203125, "step": 1505, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.913003940588057, "grad_norm": 2.675546169281006, "kl": 30.875, "learning_rate": 5.948665370489063e-06, "loss": 0.0309, "reward": 0.0027147798100486398, "reward_std": 0.08290276676416397, "rewards/ndcg_rule_reward": -0.01681647077202797, "rewards/rule_reward": 0.01953125, "step": 1506, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9136101849045165, "grad_norm": 4.064650058746338, "kl": 35.4375, "learning_rate": 5.943846845250292e-06, "loss": 0.0354, "reward": 0.004036139813251793, "reward_std": 0.11594802141189575, "rewards/ndcg_rule_reward": -0.02330761030316353, "rewards/rule_reward": 0.02734375, "step": 1507, "token_diversity": 0.39453125 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.914216429220976, "grad_norm": 15.590500831604004, "kl": 58.125, "learning_rate": 5.9390274108733515e-06, "loss": 0.0581, "reward": 0.0033780146623030305, "reward_std": 0.10782872885465622, "rewards/ndcg_rule_reward": -0.02201261091977358, "rewards/rule_reward": 0.025390625, "step": 1508, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9148226735374356, "grad_norm": 2.3713483810424805, "kl": 48.8125, "learning_rate": 5.93420707200045e-06, "loss": 0.0488, "reward": 0.0041780147003009915, "reward_std": 0.11587423831224442, "rewards/ndcg_rule_reward": -0.023165734484791756, "rewards/rule_reward": 0.02734375, "step": 1509, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9154289178538951, "grad_norm": 0.8594769835472107, "kl": 45.625, "learning_rate": 5.929385833274665e-06, "loss": 0.0456, "reward": 0.002760774688795209, "reward_std": 0.07445616275072098, "rewards/ndcg_rule_reward": -0.014817351009696722, "rewards/rule_reward": 0.017578125, "step": 1510, "token_diversity": 0.546875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9160351621703546, "grad_norm": 1.3992317914962769, "kl": 16.3203125, "learning_rate": 5.92456369933994e-06, "loss": 0.0163, "reward": 0.0026967197190970182, "reward_std": 0.07447979971766472, "rewards/ndcg_rule_reward": -0.014881405979394913, "rewards/rule_reward": 0.017578125, "step": 1511, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9166414064868141, "grad_norm": 2.462329149246216, "kl": 43.875, "learning_rate": 5.919740674841083e-06, "loss": 0.0438, "reward": 0.004322878550738096, "reward_std": 0.1494802013039589, "rewards/ndcg_rule_reward": -0.030833372846245766, "rewards/rule_reward": 0.03515625, "step": 1512, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9172476508032738, "grad_norm": 1.691230297088623, "kl": 32.0, "learning_rate": 5.914916764423756e-06, "loss": 0.0321, "reward": 0.0026370876003056765, "reward_std": 0.09135128930211067, "rewards/ndcg_rule_reward": -0.018847286701202393, "rewards/rule_reward": 0.021484375, "step": 1513, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9178538951197333, "grad_norm": 2.624851942062378, "kl": 28.9375, "learning_rate": 5.910091972734477e-06, "loss": 0.0289, "reward": 0.00254057819256559, "reward_std": 0.08298435807228088, "rewards/ndcg_rule_reward": -0.01699067186564207, "rewards/rule_reward": 0.01953125, "step": 1514, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9184601394361928, "grad_norm": 2.142061710357666, "kl": 20.1875, "learning_rate": 5.905266304420616e-06, "loss": 0.0202, "reward": 0.0018791240872815251, "reward_std": 0.07486641779541969, "rewards/ndcg_rule_reward": -0.015699001029133797, "rewards/rule_reward": 0.017578125, "step": 1515, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9190663837526524, "grad_norm": 3.363340139389038, "kl": 48.5, "learning_rate": 5.9004397641303805e-06, "loss": 0.0485, "reward": 0.003981161629781127, "reward_std": 0.12435859814286232, "rewards/ndcg_rule_reward": -0.02531571500003338, "rewards/rule_reward": 0.029296875, "step": 1516, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9196726280691119, "grad_norm": 1.6774789094924927, "kl": 46.5, "learning_rate": 5.895612356512822e-06, "loss": 0.0465, "reward": 0.0040217009373009205, "reward_std": 0.13277886807918549, "rewards/ndcg_rule_reward": -0.027228299528360367, "rewards/rule_reward": 0.03125, "step": 1517, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.9202788723855714, "grad_norm": 1.6139780282974243, "kl": 18.75, "learning_rate": 5.890784086217828e-06, "loss": 0.0187, "reward": 0.002723159035667777, "reward_std": 0.09129206463694572, "rewards/ndcg_rule_reward": -0.018761216662824154, "rewards/rule_reward": 0.021484375, "step": 1518, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9208851167020309, "grad_norm": 2.864368438720703, "kl": 21.125, "learning_rate": 5.885954957896115e-06, "loss": 0.0212, "reward": 0.002353048592340201, "reward_std": 0.08307594433426857, "rewards/ndcg_rule_reward": -0.017178201116621494, "rewards/rule_reward": 0.01953125, "step": 1519, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 10.044921875, "epoch": 0.9214913610184905, "grad_norm": 1.4790759086608887, "kl": 31.5, "learning_rate": 5.881124976199226e-06, "loss": 0.0315, "reward": 0.004337949794717133, "reward_std": 0.12422341853380203, "rewards/ndcg_rule_reward": -0.02495892532169819, "rewards/rule_reward": 0.029296875, "step": 1520, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.92209760533495, "grad_norm": 1.2058497667312622, "kl": 38.125, "learning_rate": 5.876294145779526e-06, "loss": 0.0381, "reward": 0.003318835748359561, "reward_std": 0.09944106265902519, "rewards/ndcg_rule_reward": -0.020118664484471083, "rewards/rule_reward": 0.0234375, "step": 1521, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9227038496514095, "grad_norm": 2.8724989891052246, "kl": 35.9375, "learning_rate": 5.871462471290202e-06, "loss": 0.036, "reward": 0.0026015115436166525, "reward_std": 0.09979930520057678, "rewards/ndcg_rule_reward": -0.02083598915487528, "rewards/rule_reward": 0.0234375, "step": 1522, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.923310093967869, "grad_norm": 2.4424350261688232, "kl": 26.3125, "learning_rate": 5.866629957385248e-06, "loss": 0.0263, "reward": 0.0026450552977621555, "reward_std": 0.09974581003189087, "rewards/ndcg_rule_reward": -0.020792445167899132, "rewards/rule_reward": 0.0234375, "step": 1523, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9239163382843286, "grad_norm": 2.077291250228882, "kl": 25.625, "learning_rate": 5.861796608719468e-06, "loss": 0.0256, "reward": 0.0033725586254149675, "reward_std": 0.12467868626117706, "rewards/ndcg_rule_reward": -0.0259243156760931, "rewards/rule_reward": 0.029296875, "step": 1524, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9245225826007881, "grad_norm": 1.060295581817627, "kl": 23.125, "learning_rate": 5.856962429948472e-06, "loss": 0.0231, "reward": 0.0018390566110610962, "reward_std": 0.07486696913838387, "rewards/ndcg_rule_reward": -0.015739067923277617, "rewards/rule_reward": 0.017578125, "step": 1525, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9251288269172476, "grad_norm": 1.4805350303649902, "kl": 26.625, "learning_rate": 5.852127425728666e-06, "loss": 0.0266, "reward": 0.00348037201911211, "reward_std": 0.11618521437048912, "rewards/ndcg_rule_reward": -0.02386337798088789, "rewards/rule_reward": 0.02734375, "step": 1526, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9257350712337071, "grad_norm": 1.1918240785598755, "kl": 40.125, "learning_rate": 5.847291600717259e-06, "loss": 0.0402, "reward": 0.0034507629461586475, "reward_std": 0.10779992491006851, "rewards/ndcg_rule_reward": -0.021939861588180065, "rewards/rule_reward": 0.025390625, "step": 1527, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9263413155501667, "grad_norm": 2.335951089859009, "kl": 21.296875, "learning_rate": 5.842454959572239e-06, "loss": 0.0213, "reward": 0.0031046081567183137, "reward_std": 0.12476691976189613, "rewards/ndcg_rule_reward": -0.02619226649403572, "rewards/rule_reward": 0.029296875, "step": 1528, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9269475598666262, "grad_norm": 5.999803066253662, "kl": 64.75, "learning_rate": 5.83761750695239e-06, "loss": 0.0646, "reward": 0.003259123011957854, "reward_std": 0.11628478765487671, "rewards/ndcg_rule_reward": -0.024084625765681267, "rewards/rule_reward": 0.02734375, "step": 1529, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.9275538041830858, "grad_norm": 1.0156793594360352, "kl": 31.3125, "learning_rate": 5.832779247517273e-06, "loss": 0.0313, "reward": 0.0020867011044174433, "reward_std": 0.0663311630487442, "rewards/ndcg_rule_reward": -0.0135382991284132, "rewards/rule_reward": 0.015625, "step": 1530, "token_diversity": 0.49503968253968256 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9281600484995454, "grad_norm": 1.4792250394821167, "kl": 49.125, "learning_rate": 5.827940185927227e-06, "loss": 0.0491, "reward": 0.004096536082215607, "reward_std": 0.12432248145341873, "rewards/ndcg_rule_reward": -0.025200339034199715, "rewards/rule_reward": 0.029296875, "step": 1531, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9287662928160049, "grad_norm": 74.36785125732422, "kl": 258.125, "learning_rate": 5.8231003268433635e-06, "loss": 0.258, "reward": 0.004079919075593352, "reward_std": 0.1243344247341156, "rewards/ndcg_rule_reward": -0.025216957554221153, "rewards/rule_reward": 0.029296875, "step": 1532, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9293725371324644, "grad_norm": 7.539971351623535, "kl": 37.140625, "learning_rate": 5.818259674927564e-06, "loss": 0.0373, "reward": 0.0027588524390012026, "reward_std": 0.10813956335186958, "rewards/ndcg_rule_reward": -0.022631771862506866, "rewards/rule_reward": 0.025390625, "step": 1533, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9299787814489239, "grad_norm": 1.57112455368042, "kl": 24.78125, "learning_rate": 5.813418234842467e-06, "loss": 0.0248, "reward": 0.0033969409996643662, "reward_std": 0.11621370911598206, "rewards/ndcg_rule_reward": -0.02394680865108967, "rewards/rule_reward": 0.02734375, "step": 1534, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9305850257653835, "grad_norm": 1.5377658605575562, "kl": 14.59375, "learning_rate": 5.808576011251483e-06, "loss": 0.0146, "reward": 0.00292824738426134, "reward_std": 0.09962452203035355, "rewards/ndcg_rule_reward": -0.020509253721684217, "rewards/rule_reward": 0.0234375, "step": 1535, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.931191270081843, "grad_norm": 1.2784866094589233, "kl": 34.9375, "learning_rate": 5.803733008818763e-06, "loss": 0.035, "reward": 0.0034939260222017765, "reward_std": 0.10779063776135445, "rewards/ndcg_rule_reward": -0.02189669944345951, "rewards/rule_reward": 0.025390625, "step": 1536, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9317975143983025, "grad_norm": 1.9994406700134277, "kl": 0.982421875, "learning_rate": 5.7988892322092175e-06, "loss": 0.001, "reward": 0.003713961225003004, "reward_std": 0.12447911500930786, "rewards/ndcg_rule_reward": -0.02558291330933571, "rewards/rule_reward": 0.029296875, "step": 1537, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.932403758714762, "grad_norm": 2.7147066593170166, "kl": 0.974609375, "learning_rate": 5.794044686088499e-06, "loss": 0.001, "reward": 0.00491524557583034, "reward_std": 0.1491987556219101, "rewards/ndcg_rule_reward": -0.030241006053984165, "rewards/rule_reward": 0.03515625, "step": 1538, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9330100030312216, "grad_norm": 1.78159499168396, "kl": 1.22265625, "learning_rate": 5.789199375123002e-06, "loss": 0.0012, "reward": 0.002611106727272272, "reward_std": 0.08295955508947372, "rewards/ndcg_rule_reward": -0.016920143738389015, "rewards/rule_reward": 0.01953125, "step": 1539, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9336162473476811, "grad_norm": 1.6828632354736328, "kl": 1.185546875, "learning_rate": 5.78435330397986e-06, "loss": 0.0012, "reward": 0.0024599292664788663, "reward_std": 0.09140997007489204, "rewards/ndcg_rule_reward": -0.019024445675313473, "rewards/rule_reward": 0.021484375, "step": 1540, "token_diversity": 0.53515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9342224916641406, "grad_norm": 2.0167291164398193, "kl": 1.58984375, "learning_rate": 5.779506477326933e-06, "loss": 0.0016, "reward": 0.002877049380913377, "reward_std": 0.10803312063217163, "rewards/ndcg_rule_reward": -0.022513574920594692, "rewards/rule_reward": 0.025390625, "step": 1541, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9348287359806002, "grad_norm": 3.337679862976074, "kl": 2.8046875, "learning_rate": 5.774658899832815e-06, "loss": 0.0028, "reward": 0.0036940956488251686, "reward_std": 0.11612269282341003, "rewards/ndcg_rule_reward": -0.02364965435117483, "rewards/rule_reward": 0.02734375, "step": 1542, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9354349802970597, "grad_norm": 2.010976791381836, "kl": 3.6328125, "learning_rate": 5.769810576166818e-06, "loss": 0.0036, "reward": 0.003674389561638236, "reward_std": 0.10769905894994736, "rewards/ndcg_rule_reward": -0.021716236136853695, "rewards/rule_reward": 0.025390625, "step": 1543, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 7.162109375, "epoch": 0.9360412246135192, "grad_norm": 2.2343759536743164, "kl": 3.9609375, "learning_rate": 5.764961510998977e-06, "loss": 0.004, "reward": 0.004458940122276545, "reward_std": 0.14937477558851242, "rewards/ndcg_rule_reward": -0.030697310343384743, "rewards/rule_reward": 0.03515625, "step": 1544, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9366474689299787, "grad_norm": 3.058319330215454, "kl": 8.1875, "learning_rate": 5.76011170900004e-06, "loss": 0.0082, "reward": 0.003144643735140562, "reward_std": 0.12478409707546234, "rewards/ndcg_rule_reward": -0.02615223079919815, "rewards/rule_reward": 0.029296875, "step": 1545, "token_diversity": 0.3515625 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.9372537132464384, "grad_norm": 1.851001501083374, "kl": 14.34375, "learning_rate": 5.755261174841461e-06, "loss": 0.0143, "reward": 0.0034567853435873985, "reward_std": 0.11623168736696243, "rewards/ndcg_rule_reward": -0.0238869646564126, "rewards/rule_reward": 0.02734375, "step": 1546, "token_diversity": 0.31214162844036697 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9378599575628979, "grad_norm": 1.8510979413986206, "kl": 21.625, "learning_rate": 5.750409913195407e-06, "loss": 0.0216, "reward": 0.003918793983757496, "reward_std": 0.12438194453716278, "rewards/ndcg_rule_reward": -0.025378081016242504, "rewards/rule_reward": 0.029296875, "step": 1547, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9384662018793574, "grad_norm": 11.10863971710205, "kl": 62.8125, "learning_rate": 5.745557928734737e-06, "loss": 0.0626, "reward": 0.0028757661348208785, "reward_std": 0.09965012595057487, "rewards/ndcg_rule_reward": -0.020561734214425087, "rewards/rule_reward": 0.0234375, "step": 1548, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9390724461958169, "grad_norm": 2.3634426593780518, "kl": 26.875, "learning_rate": 5.740705226133013e-06, "loss": 0.0269, "reward": 0.0025035995058715343, "reward_std": 0.09139958396553993, "rewards/ndcg_rule_reward": -0.01898077502846718, "rewards/rule_reward": 0.021484375, "step": 1549, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9396786905122765, "grad_norm": 5.407983779907227, "kl": 60.5, "learning_rate": 5.7358518100644845e-06, "loss": 0.0605, "reward": 0.002831167192198336, "reward_std": 0.09965691342949867, "rewards/ndcg_rule_reward": -0.020606333389878273, "rewards/rule_reward": 0.0234375, "step": 1550, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.940284934828736, "grad_norm": 4.673901557922363, "kl": 71.25, "learning_rate": 5.730997685204091e-06, "loss": 0.0712, "reward": 0.003721985500305891, "reward_std": 0.11608035862445831, "rewards/ndcg_rule_reward": -0.023621764965355396, "rewards/rule_reward": 0.02734375, "step": 1551, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.9408911791451955, "grad_norm": 3.364159345626831, "kl": 57.875, "learning_rate": 5.726142856227453e-06, "loss": 0.0578, "reward": 0.004066964611411095, "reward_std": 0.15798348933458328, "rewards/ndcg_rule_reward": -0.033042410388588905, "rewards/rule_reward": 0.037109375, "step": 1552, "token_diversity": 0.46521577380952384 }, { "categorical_diversity": 1.0, "completion_length": 5.720703125, "epoch": 0.941497423461655, "grad_norm": 4.794951915740967, "kl": 74.375, "learning_rate": 5.721287327810867e-06, "loss": 0.0744, "reward": 0.003745805937796831, "reward_std": 0.1160738617181778, "rewards/ndcg_rule_reward": -0.02359794359654188, "rewards/rule_reward": 0.02734375, "step": 1553, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 7.642578125, "epoch": 0.9421036677781146, "grad_norm": 1.6915518045425415, "kl": 23.75, "learning_rate": 5.716431104631312e-06, "loss": 0.0238, "reward": 0.002718825708143413, "reward_std": 0.0912928469479084, "rewards/ndcg_rule_reward": -0.01876555010676384, "rewards/rule_reward": 0.021484375, "step": 1554, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9427099120945741, "grad_norm": 119.6136245727539, "kl": 326.25, "learning_rate": 5.711574191366427e-06, "loss": 0.3278, "reward": 0.0037934790598228574, "reward_std": 0.12445171922445297, "rewards/ndcg_rule_reward": -0.025503396056592464, "rewards/rule_reward": 0.029296875, "step": 1555, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9433161564110336, "grad_norm": 1.7050203084945679, "kl": 32.4375, "learning_rate": 5.706716592694517e-06, "loss": 0.0324, "reward": 0.0022872034460306168, "reward_std": 0.07467293739318848, "rewards/ndcg_rule_reward": -0.015290921553969383, "rewards/rule_reward": 0.017578125, "step": 1556, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 8.84375, "epoch": 0.9439224007274932, "grad_norm": 2.4291698932647705, "kl": 44.625, "learning_rate": 5.701858313294553e-06, "loss": 0.0447, "reward": 0.003995905281044543, "reward_std": 0.11596515029668808, "rewards/ndcg_rule_reward": -0.023347845301032066, "rewards/rule_reward": 0.02734375, "step": 1557, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9445286450439527, "grad_norm": 2.257294178009033, "kl": 20.1875, "learning_rate": 5.696999357846154e-06, "loss": 0.0202, "reward": 0.0031253995839506388, "reward_std": 0.09949764236807823, "rewards/ndcg_rule_reward": -0.020312100648880005, "rewards/rule_reward": 0.0234375, "step": 1558, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9451348893604122, "grad_norm": 26.580381393432617, "kl": 126.875, "learning_rate": 5.6921397310295965e-06, "loss": 0.1268, "reward": 0.002838936517946422, "reward_std": 0.0996532142162323, "rewards/ndcg_rule_reward": -0.020598563365638256, "rewards/rule_reward": 0.0234375, "step": 1559, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 11.486328125, "epoch": 0.9457411336768717, "grad_norm": 1.4110766649246216, "kl": 19.0625, "learning_rate": 5.687279437525801e-06, "loss": 0.0191, "reward": 0.0034051020629704, "reward_std": 0.09938481822609901, "rewards/ndcg_rule_reward": -0.020032398402690887, "rewards/rule_reward": 0.0234375, "step": 1560, "token_diversity": 0.41015625 }, { "categorical_diversity": 1.0, "completion_length": 7.642578125, "epoch": 0.9463473779933314, "grad_norm": 1.525884747505188, "kl": 26.25, "learning_rate": 5.682418482016329e-06, "loss": 0.0262, "reward": 0.0037747942842543125, "reward_std": 0.10763579607009888, "rewards/ndcg_rule_reward": -0.021615831181406975, "rewards/rule_reward": 0.025390625, "step": 1561, "token_diversity": 0.515625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9469536223097909, "grad_norm": 3.137591600418091, "kl": 13.75, "learning_rate": 5.6775568691833816e-06, "loss": 0.0137, "reward": 0.0035979737294837832, "reward_std": 0.11615313589572906, "rewards/ndcg_rule_reward": -0.023745776154100895, "rewards/rule_reward": 0.02734375, "step": 1562, "token_diversity": 0.37109375 }, { "categorical_diversity": 1.0, "completion_length": 8.36328125, "epoch": 0.9475598666262504, "grad_norm": 2.6292970180511475, "kl": 13.625, "learning_rate": 5.672694603709794e-06, "loss": 0.0136, "reward": 0.003089957870543003, "reward_std": 0.12479907646775246, "rewards/ndcg_rule_reward": -0.02620691806077957, "rewards/rule_reward": 0.029296875, "step": 1563, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9481661109427099, "grad_norm": 2.8624110221862793, "kl": 12.921875, "learning_rate": 5.667831690279027e-06, "loss": 0.0129, "reward": 0.003199863596819341, "reward_std": 0.12476516142487526, "rewards/ndcg_rule_reward": -0.02609701082110405, "rewards/rule_reward": 0.029296875, "step": 1564, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9487723552591695, "grad_norm": 1.9742366075515747, "kl": 20.3125, "learning_rate": 5.662968133575166e-06, "loss": 0.0203, "reward": 0.00499574956484139, "reward_std": 0.14074432104825974, "rewards/ndcg_rule_reward": -0.02820737473666668, "rewards/rule_reward": 0.033203125, "step": 1565, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.949378599575629, "grad_norm": 2.083707809448242, "kl": 13.4375, "learning_rate": 5.65810393828292e-06, "loss": 0.0135, "reward": 0.0036841919645667076, "reward_std": 0.12452026456594467, "rewards/ndcg_rule_reward": -0.025612682104110718, "rewards/rule_reward": 0.029296875, "step": 1566, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9499848438920885, "grad_norm": 1.5474185943603516, "kl": 34.0, "learning_rate": 5.6532391090876084e-06, "loss": 0.034, "reward": 0.003962181508541107, "reward_std": 0.1159784272313118, "rewards/ndcg_rule_reward": -0.023381569422781467, "rewards/rule_reward": 0.02734375, "step": 1567, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 8.123046875, "epoch": 0.9505910882085481, "grad_norm": 4.126443386077881, "kl": 27.6875, "learning_rate": 5.6483736506751606e-06, "loss": 0.0277, "reward": 0.0034869553055614233, "reward_std": 0.09935585036873817, "rewards/ndcg_rule_reward": -0.019950543995946646, "rewards/rule_reward": 0.0234375, "step": 1568, "token_diversity": 0.2980352722772277 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9511973325250076, "grad_norm": 1.857122778892517, "kl": 34.0, "learning_rate": 5.64350756773212e-06, "loss": 0.034, "reward": 0.0028904579230584204, "reward_std": 0.09121336787939072, "rewards/ndcg_rule_reward": -0.018593916669487953, "rewards/rule_reward": 0.021484375, "step": 1569, "token_diversity": 0.4375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9518035768414671, "grad_norm": 1.7313238382339478, "kl": 43.75, "learning_rate": 5.6386408649456205e-06, "loss": 0.0436, "reward": 0.0031625067349523306, "reward_std": 0.09109838306903839, "rewards/ndcg_rule_reward": -0.018321868032217026, "rewards/rule_reward": 0.021484375, "step": 1570, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9524098211579266, "grad_norm": 1.850873589515686, "kl": 36.625, "learning_rate": 5.6337735470034026e-06, "loss": 0.0366, "reward": 0.001899880007840693, "reward_std": 0.06642460078001022, "rewards/ndcg_rule_reward": -0.013725120574235916, "rewards/rule_reward": 0.015625, "step": 1571, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 6.201171875, "epoch": 0.9530160654743862, "grad_norm": 2.0080740451812744, "kl": 26.21875, "learning_rate": 5.628905618593793e-06, "loss": 0.0262, "reward": 0.0026728991651907563, "reward_std": 0.07448629848659039, "rewards/ndcg_rule_reward": -0.01490522618405521, "rewards/rule_reward": 0.017578125, "step": 1572, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9536223097908457, "grad_norm": 1.233588457107544, "kl": 8.03515625, "learning_rate": 5.624037084405708e-06, "loss": 0.008, "reward": 0.001586433034390211, "reward_std": 0.06658928841352463, "rewards/ndcg_rule_reward": -0.014038566499948502, "rewards/rule_reward": 0.015625, "step": 1573, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9542285541073052, "grad_norm": 1.5392768383026123, "kl": 69.375, "learning_rate": 5.6191679491286525e-06, "loss": 0.0692, "reward": 0.004158308613114059, "reward_std": 0.10745060071349144, "rewards/ndcg_rule_reward": -0.02123231627047062, "rewards/rule_reward": 0.025390625, "step": 1574, "token_diversity": 0.52734375 }, { "categorical_diversity": 1.0, "completion_length": 7.8828125, "epoch": 0.9548347984237647, "grad_norm": 1.82088303565979, "kl": 40.6875, "learning_rate": 5.6142982174527005e-06, "loss": 0.0407, "reward": 0.004236564855091274, "reward_std": 0.11582088470458984, "rewards/ndcg_rule_reward": -0.02310718595981598, "rewards/rule_reward": 0.02734375, "step": 1575, "token_diversity": 0.2704858319935691 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9554410427402243, "grad_norm": 2.7101573944091797, "kl": 16.234375, "learning_rate": 5.609427894068508e-06, "loss": 0.0162, "reward": 0.002052865398582071, "reward_std": 0.06637907214462757, "rewards/ndcg_rule_reward": -0.013572134310379624, "rewards/rule_reward": 0.015625, "step": 1576, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9560472870566838, "grad_norm": 1.6533154249191284, "kl": 11.28125, "learning_rate": 5.604556983667299e-06, "loss": 0.0113, "reward": 0.002253156795632094, "reward_std": 0.0831078551709652, "rewards/ndcg_rule_reward": -0.017278093844652176, "rewards/rule_reward": 0.01953125, "step": 1577, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9566535313731434, "grad_norm": 1.9802953004837036, "kl": 22.0, "learning_rate": 5.599685490940866e-06, "loss": 0.022, "reward": 0.0033346801064908504, "reward_std": 0.11626409739255905, "rewards/ndcg_rule_reward": -0.024009070359170437, "rewards/rule_reward": 0.02734375, "step": 1578, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.957259775689603, "grad_norm": 3.7732584476470947, "kl": 43.25, "learning_rate": 5.594813420581554e-06, "loss": 0.0432, "reward": 0.003332345047965646, "reward_std": 0.0994407907128334, "rewards/ndcg_rule_reward": -0.020105155184864998, "rewards/rule_reward": 0.0234375, "step": 1579, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9578660200060625, "grad_norm": 3.776254892349243, "kl": 18.6875, "learning_rate": 5.5899407772822725e-06, "loss": 0.0187, "reward": 0.002335379656869918, "reward_std": 0.06622489169239998, "rewards/ndcg_rule_reward": -0.0132896201685071, "rewards/rule_reward": 0.015625, "step": 1580, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.958472264322522, "grad_norm": 2.007840633392334, "kl": 13.1875, "learning_rate": 5.585067565736478e-06, "loss": 0.0132, "reward": 0.003935155225917697, "reward_std": 0.12437939643859863, "rewards/ndcg_rule_reward": -0.025361718609929085, "rewards/rule_reward": 0.029296875, "step": 1581, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.9590785086389815, "grad_norm": 2.149419069290161, "kl": 21.5625, "learning_rate": 5.580193790638181e-06, "loss": 0.0215, "reward": 0.004399784840643406, "reward_std": 0.1410275101661682, "rewards/ndcg_rule_reward": -0.028803340159356594, "rewards/rule_reward": 0.033203125, "step": 1582, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9596847529554411, "grad_norm": 1.6266772747039795, "kl": 20.0625, "learning_rate": 5.5753194566819234e-06, "loss": 0.02, "reward": 0.0032400055788457394, "reward_std": 0.10789917409420013, "rewards/ndcg_rule_reward": -0.022150619886815548, "rewards/rule_reward": 0.025390625, "step": 1583, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 7.162109375, "epoch": 0.9602909972719006, "grad_norm": 1.1933425664901733, "kl": 26.3125, "learning_rate": 5.5704445685627974e-06, "loss": 0.0263, "reward": 0.003642311319708824, "reward_std": 0.09928006306290627, "rewards/ndcg_rule_reward": -0.019795188680291176, "rewards/rule_reward": 0.0234375, "step": 1584, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 8.36328125, "epoch": 0.9608972415883601, "grad_norm": 2.4560070037841797, "kl": 26.875, "learning_rate": 5.5655691309764225e-06, "loss": 0.0268, "reward": 0.0037203843239694834, "reward_std": 0.09925245493650436, "rewards/ndcg_rule_reward": -0.01971711590886116, "rewards/rule_reward": 0.0234375, "step": 1585, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.9615034859048196, "grad_norm": 4.093371868133545, "kl": 38.375, "learning_rate": 5.560693148618947e-06, "loss": 0.0384, "reward": 0.003368900972418487, "reward_std": 0.0994231328368187, "rewards/ndcg_rule_reward": -0.02006859891116619, "rewards/rule_reward": 0.0234375, "step": 1586, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9621097302212792, "grad_norm": 2.1846230030059814, "kl": 31.5, "learning_rate": 5.555816626187048e-06, "loss": 0.0316, "reward": 0.002485854784026742, "reward_std": 0.09140269830822945, "rewards/ndcg_rule_reward": -0.018998519517481327, "rewards/rule_reward": 0.021484375, "step": 1587, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9627159745377387, "grad_norm": 2.6508100032806396, "kl": 33.6875, "learning_rate": 5.550939568377919e-06, "loss": 0.0337, "reward": 0.0028826225316151977, "reward_std": 0.09124261885881424, "rewards/ndcg_rule_reward": -0.01860175235196948, "rewards/rule_reward": 0.021484375, "step": 1588, "token_diversity": 0.3671875 }, { "categorical_diversity": 1.0, "completion_length": 5.48046875, "epoch": 0.9633222188541982, "grad_norm": 1.9490561485290527, "kl": 26.375, "learning_rate": 5.546061979889272e-06, "loss": 0.0264, "reward": 0.0037814173847436905, "reward_std": 0.12445411086082458, "rewards/ndcg_rule_reward": -0.02551545761525631, "rewards/rule_reward": 0.029296875, "step": 1589, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.9639284631706577, "grad_norm": 1.2573918104171753, "kl": 23.4375, "learning_rate": 5.541183865419328e-06, "loss": 0.0234, "reward": 0.0028441146132536232, "reward_std": 0.09968303143978119, "rewards/ndcg_rule_reward": -0.020593385212123394, "rewards/rule_reward": 0.0234375, "step": 1590, "token_diversity": 0.3566706730769231 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9645347074871173, "grad_norm": 1.6020501852035522, "kl": 33.40625, "learning_rate": 5.536305229666815e-06, "loss": 0.0334, "reward": 0.0026967196026816964, "reward_std": 0.07447979599237442, "rewards/ndcg_rule_reward": -0.014881405048072338, "rewards/rule_reward": 0.017578125, "step": 1591, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9651409518035768, "grad_norm": 3.499124526977539, "kl": 52.6875, "learning_rate": 5.5314260773309634e-06, "loss": 0.0527, "reward": 0.0039720849599689245, "reward_std": 0.11597165837883949, "rewards/ndcg_rule_reward": -0.023371664807200432, "rewards/rule_reward": 0.02734375, "step": 1592, "token_diversity": 0.390625 }, { "categorical_diversity": 1.0, "completion_length": 8.123046875, "epoch": 0.9657471961200363, "grad_norm": 3.8253488540649414, "kl": 42.5, "learning_rate": 5.526546413111502e-06, "loss": 0.0425, "reward": 0.004190331674180925, "reward_std": 0.14115670323371887, "rewards/ndcg_rule_reward": -0.02901279367506504, "rewards/rule_reward": 0.033203125, "step": 1593, "token_diversity": 0.2796966374269006 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.966353440436496, "grad_norm": 2.0028514862060547, "kl": 49.3125, "learning_rate": 5.5216662417086556e-06, "loss": 0.0494, "reward": 0.003840039949864149, "reward_std": 0.11601307988166809, "rewards/ndcg_rule_reward": -0.023503710515797138, "rewards/rule_reward": 0.02734375, "step": 1594, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9669596847529555, "grad_norm": 2.224276542663574, "kl": 37.8125, "learning_rate": 5.516785567823126e-06, "loss": 0.0378, "reward": 0.003935315064154565, "reward_std": 0.13280775398015976, "rewards/ndcg_rule_reward": -0.027314686216413975, "rewards/rule_reward": 0.03125, "step": 1595, "token_diversity": 0.5078125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.967565929069415, "grad_norm": 1.9523271322250366, "kl": 25.625, "learning_rate": 5.511904396156113e-06, "loss": 0.0256, "reward": 0.003630738239735365, "reward_std": 0.10771102458238602, "rewards/ndcg_rule_reward": -0.021759888157248497, "rewards/rule_reward": 0.025390625, "step": 1596, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9681721733858745, "grad_norm": 2.1890926361083984, "kl": 24.53125, "learning_rate": 5.507022731409289e-06, "loss": 0.0246, "reward": 0.0022543971426784992, "reward_std": 0.09993842244148254, "rewards/ndcg_rule_reward": -0.021183103322982788, "rewards/rule_reward": 0.0234375, "step": 1597, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9687784177023341, "grad_norm": 1.8493660688400269, "kl": 23.625, "learning_rate": 5.502140578284801e-06, "loss": 0.0236, "reward": 0.0040821346919983625, "reward_std": 0.1159343495965004, "rewards/ndcg_rule_reward": -0.02326161600649357, "rewards/rule_reward": 0.02734375, "step": 1598, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9693846620187936, "grad_norm": 2.6664059162139893, "kl": 32.625, "learning_rate": 5.497257941485269e-06, "loss": 0.0326, "reward": 0.005041997181251645, "reward_std": 0.14069991558790207, "rewards/ndcg_rule_reward": -0.028161128051579, "rewards/rule_reward": 0.033203125, "step": 1599, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9699909063352531, "grad_norm": 3.0636868476867676, "kl": 19.21875, "learning_rate": 5.492374825713776e-06, "loss": 0.0192, "reward": 0.002527928212657571, "reward_std": 0.11663723364472389, "rewards/ndcg_rule_reward": -0.024815822020173073, "rewards/rule_reward": 0.02734375, "step": 1600, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9705971506517126, "grad_norm": 1.9110753536224365, "kl": 25.5625, "learning_rate": 5.487491235673867e-06, "loss": 0.0256, "reward": 0.0033251720014959574, "reward_std": 0.11623002961277962, "rewards/ndcg_rule_reward": -0.024018578231334686, "rewards/rule_reward": 0.02734375, "step": 1601, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 7.40234375, "epoch": 0.9712033949681722, "grad_norm": 2.083296775817871, "kl": 26.5, "learning_rate": 5.482607176069549e-06, "loss": 0.0265, "reward": 0.0038819205947220325, "reward_std": 0.12442954629659653, "rewards/ndcg_rule_reward": -0.025414954870939255, "rewards/rule_reward": 0.029296875, "step": 1602, "token_diversity": 0.49609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9718096392846317, "grad_norm": 2.4428436756134033, "kl": 27.875, "learning_rate": 5.4777226516052705e-06, "loss": 0.0278, "reward": 0.0036887709284201264, "reward_std": 0.11612150073051453, "rewards/ndcg_rule_reward": -0.023654978722333908, "rewards/rule_reward": 0.02734375, "step": 1603, "token_diversity": 0.41796875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9724158836010912, "grad_norm": 1.7482430934906006, "kl": 31.75, "learning_rate": 5.472837666985941e-06, "loss": 0.0317, "reward": 0.003427898627705872, "reward_std": 0.11620426550507545, "rewards/ndcg_rule_reward": -0.023915850557386875, "rewards/rule_reward": 0.02734375, "step": 1604, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9730221279175508, "grad_norm": 1.474310040473938, "kl": 36.3125, "learning_rate": 5.467952226916903e-06, "loss": 0.0363, "reward": 0.003731548204086721, "reward_std": 0.11607415229082108, "rewards/ndcg_rule_reward": -0.023612202145159245, "rewards/rule_reward": 0.02734375, "step": 1605, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 6.681640625, "epoch": 0.9736283722340103, "grad_norm": 6.283820152282715, "kl": 66.625, "learning_rate": 5.46306633610394e-06, "loss": 0.0667, "reward": 0.005103281931951642, "reward_std": 0.15750617533922195, "rewards/ndcg_rule_reward": -0.03200609516352415, "rewards/rule_reward": 0.037109375, "step": 1606, "token_diversity": 0.3051581325301205 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9742346165504698, "grad_norm": 1.5143204927444458, "kl": 39.0, "learning_rate": 5.458179999253274e-06, "loss": 0.0389, "reward": 0.0038310764357447624, "reward_std": 0.12444039061665535, "rewards/ndcg_rule_reward": -0.025465798564255238, "rewards/rule_reward": 0.029296875, "step": 1607, "token_diversity": 0.40234375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9748408608669293, "grad_norm": 1.8986055850982666, "kl": 16.90625, "learning_rate": 5.45329322107155e-06, "loss": 0.0169, "reward": 0.002738274692092091, "reward_std": 0.09129786491394043, "rewards/ndcg_rule_reward": -0.01874610036611557, "rewards/rule_reward": 0.021484375, "step": 1608, "token_diversity": 0.4765625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.975447105183389, "grad_norm": 4.058446407318115, "kl": 36.375, "learning_rate": 5.4484060062658466e-06, "loss": 0.0363, "reward": 0.0034815799444913864, "reward_std": 0.10775577649474144, "rewards/ndcg_rule_reward": -0.021909045055508614, "rewards/rule_reward": 0.025390625, "step": 1609, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9760533494998485, "grad_norm": 647.9063110351562, "kl": 645.5625, "learning_rate": 5.443518359543653e-06, "loss": 0.6445, "reward": 0.003363351570442319, "reward_std": 0.11622002720832825, "rewards/ndcg_rule_reward": -0.0239803995937109, "rewards/rule_reward": 0.02734375, "step": 1610, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.976659593816308, "grad_norm": 2.907236337661743, "kl": 26.875, "learning_rate": 5.438630285612881e-06, "loss": 0.0269, "reward": 0.003961105248890817, "reward_std": 0.14122749865055084, "rewards/ndcg_rule_reward": -0.02924202010035515, "rewards/rule_reward": 0.033203125, "step": 1611, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 6.44140625, "epoch": 0.9772658381327675, "grad_norm": 3.466374158859253, "kl": 31.125, "learning_rate": 5.433741789181854e-06, "loss": 0.0312, "reward": 0.0024571437388658524, "reward_std": 0.09983167424798012, "rewards/ndcg_rule_reward": -0.020980357192456722, "rewards/rule_reward": 0.0234375, "step": 1612, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9778720824492271, "grad_norm": 3.130596876144409, "kl": 28.0, "learning_rate": 5.428852874959296e-06, "loss": 0.028, "reward": 0.0032713408581912518, "reward_std": 0.12467837333679199, "rewards/ndcg_rule_reward": -0.02602553367614746, "rewards/rule_reward": 0.029296875, "step": 1613, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.9784783267656866, "grad_norm": 2.5539071559906006, "kl": 22.0625, "learning_rate": 5.423963547654345e-06, "loss": 0.0221, "reward": 0.0027789921732619405, "reward_std": 0.08285064622759819, "rewards/ndcg_rule_reward": -0.01675225794315338, "rewards/rule_reward": 0.01953125, "step": 1614, "token_diversity": 0.38671875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9790845710821461, "grad_norm": 2.0790905952453613, "kl": 21.1875, "learning_rate": 5.419073811976525e-06, "loss": 0.0212, "reward": 0.0031743652652949095, "reward_std": 0.12474071979522705, "rewards/ndcg_rule_reward": -0.026122509501874447, "rewards/rule_reward": 0.029296875, "step": 1615, "token_diversity": 0.40625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9796908153986056, "grad_norm": 5.249034404754639, "kl": 50.125, "learning_rate": 5.41418367263576e-06, "loss": 0.05, "reward": 0.003719676285982132, "reward_std": 0.12450867146253586, "rewards/ndcg_rule_reward": -0.025577198714017868, "rewards/rule_reward": 0.029296875, "step": 1616, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9802970597150652, "grad_norm": 1.3889129161834717, "kl": 17.978515625, "learning_rate": 5.40929313434236e-06, "loss": 0.018, "reward": 0.002843761583790183, "reward_std": 0.10808871686458588, "rewards/ndcg_rule_reward": -0.022546864114701748, "rewards/rule_reward": 0.025390625, "step": 1617, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9809033040315247, "grad_norm": 1.5583751201629639, "kl": 15.5625, "learning_rate": 5.404402201807022e-06, "loss": 0.0155, "reward": 0.003006387792993337, "reward_std": 0.11639957875013351, "rewards/ndcg_rule_reward": -0.024337361566722393, "rewards/rule_reward": 0.02734375, "step": 1618, "token_diversity": 0.46875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9815095483479842, "grad_norm": 1.897169828414917, "kl": 15.375, "learning_rate": 5.39951087974082e-06, "loss": 0.0154, "reward": 0.00327936839312315, "reward_std": 0.1246776357293129, "rewards/ndcg_rule_reward": -0.026017505675554276, "rewards/rule_reward": 0.029296875, "step": 1619, "token_diversity": 0.46484375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9821157926644438, "grad_norm": 4.050771713256836, "kl": 17.5078125, "learning_rate": 5.394619172855203e-06, "loss": 0.0175, "reward": 0.002830801997333765, "reward_std": 0.09968671947717667, "rewards/ndcg_rule_reward": -0.020606698468327522, "rewards/rule_reward": 0.0234375, "step": 1620, "token_diversity": 0.50390625 }, { "categorical_diversity": 1.0, "completion_length": 5.962890625, "epoch": 0.9827220369809033, "grad_norm": 3.315246105194092, "kl": 41.0, "learning_rate": 5.389727085861997e-06, "loss": 0.0411, "reward": 0.004288035212084651, "reward_std": 0.13263476639986038, "rewards/ndcg_rule_reward": -0.026961964555084705, "rewards/rule_reward": 0.03125, "step": 1621, "token_diversity": 0.421875 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.9833282812973628, "grad_norm": 1.7773818969726562, "kl": 40.125, "learning_rate": 5.384834623473385e-06, "loss": 0.0401, "reward": 0.004247519886121154, "reward_std": 0.13264522701501846, "rewards/ndcg_rule_reward": -0.027002479881048203, "rewards/rule_reward": 0.03125, "step": 1622, "token_diversity": 0.43359375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9839345256138223, "grad_norm": 1.6448911428451538, "kl": 23.3125, "learning_rate": 5.3799417904019156e-06, "loss": 0.0233, "reward": 0.0028368800412863493, "reward_std": 0.10805393382906914, "rewards/ndcg_rule_reward": -0.022553743794560432, "rewards/rule_reward": 0.025390625, "step": 1623, "token_diversity": 0.484375 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.984540769930282, "grad_norm": 1.8029265403747559, "kl": 22.4375, "learning_rate": 5.375048591360496e-06, "loss": 0.0224, "reward": 0.00253959686961025, "reward_std": 0.09138962253928185, "rewards/ndcg_rule_reward": -0.01894477941095829, "rewards/rule_reward": 0.021484375, "step": 1624, "token_diversity": 0.5223834325396826 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9851470142467414, "grad_norm": 1.8769659996032715, "kl": 14.78125, "learning_rate": 5.370155031062386e-06, "loss": 0.0147, "reward": 0.0019728828920051455, "reward_std": 0.09162273071706295, "rewards/ndcg_rule_reward": -0.01951149175874889, "rewards/rule_reward": 0.021484375, "step": 1625, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.985753258563201, "grad_norm": 1.7032219171524048, "kl": 19.1875, "learning_rate": 5.365261114221189e-06, "loss": 0.0192, "reward": 0.0028856696444563568, "reward_std": 0.09964334964752197, "rewards/ndcg_rule_reward": -0.0205518314614892, "rewards/rule_reward": 0.0234375, "step": 1626, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9863595028796605, "grad_norm": 3.143312692642212, "kl": 49.375, "learning_rate": 5.360366845550856e-06, "loss": 0.0494, "reward": 0.003730706754140556, "reward_std": 0.12450403720140457, "rewards/ndcg_rule_reward": -0.025566168129444122, "rewards/rule_reward": 0.029296875, "step": 1627, "token_diversity": 0.5 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9869657471961201, "grad_norm": 1.576290249824524, "kl": 37.0, "learning_rate": 5.355472229765674e-06, "loss": 0.037, "reward": 0.0037597964983433485, "reward_std": 0.11609972268342972, "rewards/ndcg_rule_reward": -0.023583954200148582, "rewards/rule_reward": 0.02734375, "step": 1628, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9875719915125796, "grad_norm": 3.1643319129943848, "kl": 34.8125, "learning_rate": 5.35057727158027e-06, "loss": 0.0348, "reward": 0.004370391368865967, "reward_std": 0.13263589516282082, "rewards/ndcg_rule_reward": -0.026879608631134033, "rewards/rule_reward": 0.03125, "step": 1629, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9881782358290391, "grad_norm": 2.553255558013916, "kl": 59.375, "learning_rate": 5.345681975709595e-06, "loss": 0.0594, "reward": 0.004058072459883988, "reward_std": 0.11591117829084396, "rewards/ndcg_rule_reward": -0.02328567858785391, "rewards/rule_reward": 0.02734375, "step": 1630, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 7.8828125, "epoch": 0.9887844801454987, "grad_norm": 2.808774709701538, "kl": 24.8125, "learning_rate": 5.3407863468689245e-06, "loss": 0.0248, "reward": 0.0034195896005257964, "reward_std": 0.13309426605701447, "rewards/ndcg_rule_reward": -0.02783041074872017, "rewards/rule_reward": 0.03125, "step": 1631, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9893907244619582, "grad_norm": 1.9822427034378052, "kl": 24.25, "learning_rate": 5.33589038977386e-06, "loss": 0.0243, "reward": 0.002874714555218816, "reward_std": 0.08281901106238365, "rewards/ndcg_rule_reward": -0.016656535677611828, "rewards/rule_reward": 0.01953125, "step": 1632, "token_diversity": 0.4453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9899969687784177, "grad_norm": 5.348130702972412, "kl": 27.625, "learning_rate": 5.330994109140315e-06, "loss": 0.0277, "reward": 0.003164658322930336, "reward_std": 0.0910908654332161, "rewards/ndcg_rule_reward": -0.01831971760839224, "rewards/rule_reward": 0.021484375, "step": 1633, "token_diversity": 0.4609375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9906032130948772, "grad_norm": 8.772811889648438, "kl": 40.375, "learning_rate": 5.326097509684519e-06, "loss": 0.0405, "reward": 0.004500535549595952, "reward_std": 0.15778379887342453, "rewards/ndcg_rule_reward": -0.032608840614557266, "rewards/rule_reward": 0.037109375, "step": 1634, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9912094574113368, "grad_norm": 2.3158905506134033, "kl": 42.625, "learning_rate": 5.321200596123001e-06, "loss": 0.0426, "reward": 0.004579281434416771, "reward_std": 0.14933853596448898, "rewards/ndcg_rule_reward": -0.03057696856558323, "rewards/rule_reward": 0.03515625, "step": 1635, "token_diversity": 0.453125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9918157017277963, "grad_norm": 1.2496801614761353, "kl": 18.5, "learning_rate": 5.316303373172601e-06, "loss": 0.0185, "reward": 0.0026831747964024544, "reward_std": 0.09131169691681862, "rewards/ndcg_rule_reward": -0.01880120113492012, "rewards/rule_reward": 0.021484375, "step": 1636, "token_diversity": 0.44140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9924219460442558, "grad_norm": 3.244997024536133, "kl": 21.5625, "learning_rate": 5.3114058455504516e-06, "loss": 0.0216, "reward": 0.0037953370483592153, "reward_std": 0.13288083672523499, "rewards/ndcg_rule_reward": -0.027454662136733532, "rewards/rule_reward": 0.03125, "step": 1637, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 6.681640625, "epoch": 0.9930281903607153, "grad_norm": 2.486875057220459, "kl": 14.9375, "learning_rate": 5.306508017973982e-06, "loss": 0.0149, "reward": 0.0029663904570043087, "reward_std": 0.09118613973259926, "rewards/ndcg_rule_reward": -0.01851798500865698, "rewards/rule_reward": 0.021484375, "step": 1638, "token_diversity": 0.48828125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9936344346771749, "grad_norm": 3.6713199615478516, "kl": 39.75, "learning_rate": 5.301609895160906e-06, "loss": 0.0397, "reward": 0.004077012417837977, "reward_std": 0.14959758520126343, "rewards/ndcg_rule_reward": -0.031079236418008804, "rewards/rule_reward": 0.03515625, "step": 1639, "token_diversity": 0.4296875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9942406789936344, "grad_norm": 3.240003824234009, "kl": 25.21875, "learning_rate": 5.296711481829227e-06, "loss": 0.0253, "reward": 0.00406029517762363, "reward_std": 0.16641248762607574, "rewards/ndcg_rule_reward": -0.0350022055208683, "rewards/rule_reward": 0.0390625, "step": 1640, "token_diversity": 0.44921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.994846923310094, "grad_norm": 1.97988760471344, "kl": 46.25, "learning_rate": 5.291812782697226e-06, "loss": 0.0463, "reward": 0.003946633310988545, "reward_std": 0.12437368929386139, "rewards/ndcg_rule_reward": -0.025350242853164673, "rewards/rule_reward": 0.029296875, "step": 1641, "token_diversity": 0.42578125 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9954531676265534, "grad_norm": 6.765998840332031, "kl": 46.375, "learning_rate": 5.2869138024834585e-06, "loss": 0.0463, "reward": 0.0027017476968467236, "reward_std": 0.09133873134851456, "rewards/ndcg_rule_reward": -0.01878262870013714, "rewards/rule_reward": 0.021484375, "step": 1642, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9960594119430131, "grad_norm": 2.7638392448425293, "kl": 25.9375, "learning_rate": 5.282014545906751e-06, "loss": 0.0259, "reward": 0.003109479381237179, "reward_std": 0.10794037953019142, "rewards/ndcg_rule_reward": -0.022281145676970482, "rewards/rule_reward": 0.025390625, "step": 1643, "token_diversity": 0.51171875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9966656562594726, "grad_norm": 2.2437963485717773, "kl": 35.9375, "learning_rate": 5.277115017686196e-06, "loss": 0.0359, "reward": 0.004539765301160514, "reward_std": 0.13255612552165985, "rewards/ndcg_rule_reward": -0.026710234582424164, "rewards/rule_reward": 0.03125, "step": 1644, "token_diversity": 0.45703125 }, { "categorical_diversity": 1.0, "completion_length": 5.9609375, "epoch": 0.9972719005759321, "grad_norm": 1.1141301393508911, "kl": 5.578125, "learning_rate": 5.27221522254115e-06, "loss": 0.0056, "reward": 0.0013689286424778402, "reward_std": 0.04988932982087135, "rewards/ndcg_rule_reward": -0.010349821299314499, "rewards/rule_reward": 0.01171875, "step": 1645, "token_diversity": 0.47265625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9978781448923917, "grad_norm": 243.89805603027344, "kl": 58.75, "learning_rate": 5.2673151651912255e-06, "loss": 0.0586, "reward": 0.0029797371244058013, "reward_std": 0.09118770062923431, "rewards/ndcg_rule_reward": -0.018504638224840164, "rewards/rule_reward": 0.021484375, "step": 1646, "token_diversity": 0.48046875 }, { "categorical_diversity": 1.0, "completion_length": 5.240234375, "epoch": 0.9984843892088512, "grad_norm": 1.1187022924423218, "kl": 42.5, "learning_rate": 5.262414850356286e-06, "loss": 0.0425, "reward": 0.0035938852233812213, "reward_std": 0.10772056877613068, "rewards/ndcg_rule_reward": -0.021796739660203457, "rewards/rule_reward": 0.025390625, "step": 1647, "token_diversity": 0.3984375 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9990906335253107, "grad_norm": 20.25361442565918, "kl": 100.25, "learning_rate": 5.257514282756441e-06, "loss": 0.1003, "reward": 0.0032563109416514635, "reward_std": 0.09945870563387871, "rewards/ndcg_rule_reward": -0.020181188359856606, "rewards/rule_reward": 0.0234375, "step": 1648, "token_diversity": 0.4140625 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 0.9996968778417702, "grad_norm": 1.9420148134231567, "kl": 37.4375, "learning_rate": 5.25261346711205e-06, "loss": 0.0375, "reward": 0.0036890562623739243, "reward_std": 0.12448589503765106, "rewards/ndcg_rule_reward": -0.025607818737626076, "rewards/rule_reward": 0.029296875, "step": 1649, "token_diversity": 0.4921875 }, { "categorical_diversity": 1.0, "completion_length": 5.0, "epoch": 1.0, "grad_norm": 1.9420148134231567, "kl": 32.75, "learning_rate": 5.247712408143708e-06, "loss": 0.0327, "reward": 0.002305347938090563, "reward_std": 0.08305943012237549, "rewards/ndcg_rule_reward": -0.017225902527570724, "rewards/rule_reward": 0.01953125, "step": 1650, "token_diversity": 0.5 }, { "epoch": 1.0, "eval_categorical_diversity": 1.0, "eval_completion_length": 5.0, "eval_kl": 15.303697183098592, "eval_loss": 0.015357478521764278, "eval_reward": 0.0013857190968619634, "eval_reward_std": 0.04789249667189491, "eval_rewards/ndcg_rule_reward": -0.009879135657888902, "eval_rewards/rule_reward": 0.011264854753521127, "eval_runtime": 85.0364, "eval_samples_per_second": 53.295, "eval_steps_per_second": 0.059, "eval_token_diversity": 0.3484540052816901, "step": 1650 } ], "logging_steps": 1, "max_steps": 3300, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 330, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }