diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,515 +1,5023 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9790209790209791, + "epoch": 0.9993002099370188, "eval_steps": 500, - "global_step": 35, + "global_step": 357, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "completion_length": 1266.1309509277344, - "epoch": 0.027972027972027972, - "grad_norm": 0.8290109038352966, + "completion_length": 1212.7916870117188, + "epoch": 0.0027991602519244225, + "grad_norm": 0.5663476586341858, "kl": 0.0, - "learning_rate": 5e-06, + "learning_rate": 5.555555555555555e-07, "loss": 0.0, - "reward": 0.174851194024086, - "reward_std": 0.28419411182403564, - "rewards/accuracy_reward": 0.08630952564999461, - "rewards/format_reward": 0.0059523810632526875, - "rewards/tag_count_reward": 0.08258928637951612, + "reward": 0.1912202388048172, + "reward_std": 0.29104578867554665, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.057291666977107525, "step": 1 }, { - "completion_length": 1241.7916870117188, - "epoch": 0.055944055944055944, - "grad_norm": 0.3183291256427765, + "completion_length": 1469.4702758789062, + "epoch": 0.005598320503848845, + "grad_norm": 0.17836421728134155, "kl": 0.0, - "learning_rate": 1e-05, + "learning_rate": 1.111111111111111e-06, "loss": 0.0, - "reward": 0.11904762405902147, - "reward_std": 0.22773383557796478, - "rewards/accuracy_reward": 0.0505952388048172, + "reward": 0.13839286006987095, + "reward_std": 0.2532867342233658, + "rewards/accuracy_reward": 0.06845238339155912, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.06845238152891397, + "rewards/tag_count_reward": 0.0699404776096344, "step": 2 }, { - "completion_length": 1270.7619323730469, - "epoch": 0.08391608391608392, - "grad_norm": 0.4639536738395691, - "kl": 5.7220458984375e-05, - "learning_rate": 1.5000000000000002e-05, + "completion_length": 1186.4702453613281, + "epoch": 0.008397480755773267, + "grad_norm": 0.4373953640460968, + "kl": 5.626678466796875e-05, + "learning_rate": 1.6666666666666667e-06, "loss": 0.0, - "reward": 0.1688988134264946, - "reward_std": 0.27802807092666626, - "rewards/accuracy_reward": 0.11011905036866665, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.05877976305782795, + "reward": 0.17633929289877415, + "reward_std": 0.29880572110414505, + "rewards/accuracy_reward": 0.10714285913854837, + "rewards/format_reward": 0.0029761905316263437, + "rewards/tag_count_reward": 0.06622024066746235, "step": 3 }, { - "completion_length": 1434.8779907226562, - "epoch": 0.11188811188811189, - "grad_norm": 0.3119446039199829, - "kl": 0.0002663135528564453, - "learning_rate": 2e-05, + "completion_length": 1353.8512268066406, + "epoch": 0.01119664100769769, + "grad_norm": 0.3016909658908844, + "kl": 5.543231964111328e-05, + "learning_rate": 2.222222222222222e-06, "loss": 0.0, - "reward": 0.1815476194024086, - "reward_std": 0.31393999606370926, - "rewards/accuracy_reward": 0.1041666679084301, + "reward": 0.13616071827709675, + "reward_std": 0.22534876689314842, + "rewards/accuracy_reward": 0.09523809910751879, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.07738095335662365, + "rewards/tag_count_reward": 0.04092261986806989, "step": 4 }, { - "completion_length": 1392.0565795898438, - "epoch": 0.13986013986013987, - "grad_norm": 0.2788020074367523, - "kl": 0.00302886962890625, - "learning_rate": 1.994869323391895e-05, - "loss": 0.0001, - "reward": 0.1830357201397419, - "reward_std": 0.3282713554799557, - "rewards/accuracy_reward": 0.05059523903764784, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.1324404776096344, + "completion_length": 1250.9702453613281, + "epoch": 0.013995801259622114, + "grad_norm": 0.9041264653205872, + "kl": 7.56382942199707e-05, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.0, + "reward": 0.13020833767950535, + "reward_std": 0.22718050703406334, + "rewards/accuracy_reward": 0.08333333488553762, + "rewards/format_reward": 0.0029761905316263437, + "rewards/tag_count_reward": 0.04389881109818816, "step": 5 }, { - "completion_length": 1370.827392578125, - "epoch": 0.16783216783216784, - "grad_norm": 0.42689841985702515, - "kl": 0.024688720703125, - "learning_rate": 1.9795299412524948e-05, - "loss": 0.001, - "reward": 0.3221726268529892, - "reward_std": 0.41605522483587265, - "rewards/accuracy_reward": 0.06250000116415322, + "completion_length": 1465.3363037109375, + "epoch": 0.016794961511546535, + "grad_norm": 0.7786433696746826, + "kl": 0.00011903047561645508, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0, + "reward": 0.1078869067132473, + "reward_std": 0.22585948556661606, + "rewards/accuracy_reward": 0.06845238339155912, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.2596726305782795, + "rewards/tag_count_reward": 0.03943452425301075, "step": 6 }, { - "completion_length": 1480.6607360839844, - "epoch": 0.1958041958041958, - "grad_norm": 1.6140049695968628, - "kl": 0.090087890625, - "learning_rate": 1.954139256400049e-05, - "loss": 0.0036, - "reward": 0.5044642835855484, - "reward_std": 0.4936201646924019, - "rewards/accuracy_reward": 0.0476190485060215, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.4568452462553978, + "completion_length": 1294.5952453613281, + "epoch": 0.01959412176347096, + "grad_norm": 0.21782070398330688, + "kl": 7.742643356323242e-05, + "learning_rate": 3.88888888888889e-06, + "loss": 0.0, + "reward": 0.2224702388048172, + "reward_std": 0.2926863096654415, + "rewards/accuracy_reward": 0.12500000232830644, + "rewards/format_reward": 0.0059523810632526875, + "rewards/tag_count_reward": 0.09151786100119352, "step": 7 }, { - "completion_length": 1408.3869323730469, - "epoch": 0.22377622377622378, - "grad_norm": 0.272751122713089, - "kl": 0.020416259765625, - "learning_rate": 1.918957811620231e-05, - "loss": 0.0008, - "reward": 0.6250000149011612, - "reward_std": 0.4978248253464699, - "rewards/accuracy_reward": 0.059523810632526875, + "completion_length": 1446.3810119628906, + "epoch": 0.02239328201539538, + "grad_norm": 0.21054071187973022, + "kl": 8.07642936706543e-05, + "learning_rate": 4.444444444444444e-06, + "loss": 0.0, + "reward": 0.1510416716337204, + "reward_std": 0.2783088833093643, + "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5654762089252472, + "rewards/tag_count_reward": 0.06175595335662365, "step": 8 }, { - "completion_length": 1243.511962890625, - "epoch": 0.2517482517482518, - "grad_norm": 0.3416856825351715, - "kl": 0.0301513671875, - "learning_rate": 1.8743466161445823e-05, - "loss": 0.0012, - "reward": 0.625, - "reward_std": 0.4570515900850296, - "rewards/accuracy_reward": 0.026785715017467737, + "completion_length": 1282.6310119628906, + "epoch": 0.025192442267319804, + "grad_norm": 0.5751554369926453, + "kl": 0.00016450881958007812, + "learning_rate": 5e-06, + "loss": 0.0, + "reward": 0.174851194024086, + "reward_std": 0.2629920206964016, + "rewards/accuracy_reward": 0.11607143119908869, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5982142835855484, + "rewards/tag_count_reward": 0.05877976305782795, "step": 9 }, { - "completion_length": 1153.3899230957031, - "epoch": 0.27972027972027974, - "grad_norm": 0.29056915640830994, - "kl": 0.0340576171875, - "learning_rate": 1.8207634412072765e-05, - "loss": 0.0014, - "reward": 0.758928582072258, - "reward_std": 0.47312967479228973, - "rewards/accuracy_reward": 0.07142857369035482, + "completion_length": 1256.5595703125, + "epoch": 0.02799160251924423, + "grad_norm": 0.5731971263885498, + "kl": 0.00046133995056152344, + "learning_rate": 5.555555555555557e-06, + "loss": 0.0, + "reward": 0.16741071827709675, + "reward_std": 0.2973426282405853, + "rewards/accuracy_reward": 0.1190476231276989, "rewards/format_reward": 0.0029761905316263437, - "rewards/tag_count_reward": 0.6845238208770752, + "rewards/tag_count_reward": 0.04538690624758601, "step": 10 }, { - "completion_length": 1052.8690490722656, - "epoch": 0.3076923076923077, - "grad_norm": 0.42435142397880554, - "kl": 0.0460205078125, - "learning_rate": 1.758758122692791e-05, - "loss": 0.0018, - "reward": 0.8861607164144516, - "reward_std": 0.4194239601492882, - "rewards/accuracy_reward": 0.09821428777649999, + "completion_length": 1325.2887268066406, + "epoch": 0.03079076277116865, + "grad_norm": 1.7843555212020874, + "kl": 0.0006890296936035156, + "learning_rate": 6.111111111111112e-06, + "loss": 0.0, + "reward": 0.1562500074505806, + "reward_std": 0.2497977502644062, + "rewards/accuracy_reward": 0.08035714295692742, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7879464477300644, + "rewards/tag_count_reward": 0.07589285913854837, "step": 11 }, { - "completion_length": 908.1636962890625, - "epoch": 0.3356643356643357, - "grad_norm": 0.4831395149230957, - "kl": 0.078125, - "learning_rate": 1.688966919075687e-05, - "loss": 0.0031, - "reward": 0.9203869104385376, - "reward_std": 0.34339290857315063, - "rewards/accuracy_reward": 0.06547619216144085, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8549107164144516, + "completion_length": 1345.6369323730469, + "epoch": 0.03358992302309307, + "grad_norm": 0.34509122371673584, + "kl": 0.0025167465209960938, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0001, + "reward": 0.13169643096625805, + "reward_std": 0.24966860935091972, + "rewards/accuracy_reward": 0.04761904804036021, + "rewards/format_reward": 0.0029761905316263437, + "rewards/tag_count_reward": 0.08110119123011827, "step": 12 }, { - "completion_length": 695.6875, - "epoch": 0.36363636363636365, - "grad_norm": 1.9463610649108887, - "kl": 0.1634521484375, - "learning_rate": 1.612105982547663e-05, - "loss": 0.0065, - "reward": 0.9203869104385376, - "reward_std": 0.3368927761912346, - "rewards/accuracy_reward": 0.06547619309276342, - "rewards/format_reward": 0.0029761905316263437, - "rewards/tag_count_reward": 0.8519345372915268, + "completion_length": 1360.7559814453125, + "epoch": 0.0363890832750175, + "grad_norm": 0.33581575751304626, + "kl": 0.0037994384765625, + "learning_rate": 7.222222222222223e-06, + "loss": 0.0002, + "reward": 0.2023809514939785, + "reward_std": 0.3145877979695797, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.07738095242530107, "step": 13 }, { - "completion_length": 699.0357208251953, - "epoch": 0.3916083916083916, - "grad_norm": 0.49192556738853455, - "kl": 0.09423828125, - "learning_rate": 1.5289640103269626e-05, - "loss": 0.0038, - "reward": 0.9069940745830536, - "reward_std": 0.3137018084526062, - "rewards/accuracy_reward": 0.056547619635239244, + "completion_length": 1276.7916564941406, + "epoch": 0.03918824352694192, + "grad_norm": 0.22585485875606537, + "kl": 0.0034046173095703125, + "learning_rate": 7.77777777777778e-06, + "loss": 0.0001, + "reward": 0.21279762126505375, + "reward_std": 0.303580678999424, + "rewards/accuracy_reward": 0.12797619123011827, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8504464477300644, + "rewards/tag_count_reward": 0.08482143213041127, "step": 14 }, { - "completion_length": 588.755973815918, - "epoch": 0.4195804195804196, - "grad_norm": 0.29993587732315063, - "kl": 0.08349609375, - "learning_rate": 1.4403941515576344e-05, - "loss": 0.0033, - "reward": 1.0260416716337204, - "reward_std": 0.2692214325070381, - "rewards/accuracy_reward": 0.0922619067132473, + "completion_length": 1444.0654907226562, + "epoch": 0.04198740377886634, + "grad_norm": 1.2288179397583008, + "kl": 0.04129791259765625, + "learning_rate": 8.333333333333334e-06, + "loss": 0.0016, + "reward": 0.2537202425301075, + "reward_std": 0.36673635244369507, + "rewards/accuracy_reward": 0.11011904943734407, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.933779776096344, + "rewards/tag_count_reward": 0.1436011902987957, "step": 15 }, { - "completion_length": 676.077392578125, - "epoch": 0.44755244755244755, - "grad_norm": 0.32921651005744934, - "kl": 0.093017578125, - "learning_rate": 1.3473052528448203e-05, - "loss": 0.0037, - "reward": 0.9352678656578064, - "reward_std": 0.257955402135849, - "rewards/accuracy_reward": 0.026785715017467737, - "rewards/format_reward": 0.0059523810632526875, - "rewards/tag_count_reward": 0.9025297909975052, + "completion_length": 1456.8750305175781, + "epoch": 0.04478656403079076, + "grad_norm": 0.40248867869377136, + "kl": 0.021881103515625, + "learning_rate": 8.888888888888888e-06, + "loss": 0.0009, + "reward": 0.2410714328289032, + "reward_std": 0.3852364495396614, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.0029761905316263437, + "rewards/tag_count_reward": 0.1488095261156559, "step": 16 }, { - "completion_length": 677.7113189697266, - "epoch": 0.4755244755244755, - "grad_norm": 0.7335661053657532, - "kl": 0.0714111328125, - "learning_rate": 1.2506525322587207e-05, - "loss": 0.0029, - "reward": 0.9575892984867096, - "reward_std": 0.29887210205197334, - "rewards/accuracy_reward": 0.050595239736139774, - "rewards/format_reward": 0.0029761905316263437, - "rewards/tag_count_reward": 0.9040178805589676, + "completion_length": 1395.2292175292969, + "epoch": 0.04758572428271519, + "grad_norm": 0.8993555307388306, + "kl": 0.01361083984375, + "learning_rate": 9.444444444444445e-06, + "loss": 0.0005, + "reward": 0.26116071827709675, + "reward_std": 0.3668462857604027, + "rewards/accuracy_reward": 0.12797619309276342, + "rewards/format_reward": 0.0059523810632526875, + "rewards/tag_count_reward": 0.12723214365541935, "step": 17 }, { - "completion_length": 654.7976379394531, - "epoch": 0.5034965034965035, - "grad_norm": 0.29916274547576904, - "kl": 0.072021484375, - "learning_rate": 1.1514277775045768e-05, - "loss": 0.0029, - "reward": 1.0141368955373764, - "reward_std": 0.20955239236354828, - "rewards/accuracy_reward": 0.06250000093132257, - "rewards/format_reward": 0.0029761905316263437, - "rewards/tag_count_reward": 0.948660746216774, + "completion_length": 1457.8720397949219, + "epoch": 0.05038488453463961, + "grad_norm": 0.38311967253685, + "kl": 0.01342010498046875, + "learning_rate": 1e-05, + "loss": 0.0005, + "reward": 0.3355654776096344, + "reward_std": 0.4032140150666237, + "rewards/accuracy_reward": 0.1577381007373333, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1778273843228817, "step": 18 }, { - "completion_length": 590.1339416503906, - "epoch": 0.5314685314685315, - "grad_norm": 0.29927486181259155, - "kl": 0.1075439453125, - "learning_rate": 1.0506491688387128e-05, - "loss": 0.0043, - "reward": 1.0126488208770752, - "reward_std": 0.1801503635942936, - "rewards/accuracy_reward": 0.044642859138548374, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96800597012043, + "completion_length": 1412.0327758789062, + "epoch": 0.05318404478656403, + "grad_norm": 0.3383859395980835, + "kl": 0.0170135498046875, + "learning_rate": 1.0555555555555557e-05, + "loss": 0.0007, + "reward": 0.2589285783469677, + "reward_std": 0.3808591440320015, + "rewards/accuracy_reward": 0.09226190787740052, + "rewards/format_reward": 0.0029761905316263437, + "rewards/tag_count_reward": 0.16369047854095697, "step": 19 }, { - "completion_length": 628.7232208251953, - "epoch": 0.5594405594405595, - "grad_norm": 0.7065412998199463, - "kl": 0.158447265625, - "learning_rate": 9.493508311612874e-06, - "loss": 0.0063, - "reward": 1.0230654925107956, - "reward_std": 0.24343841522932053, - "rewards/accuracy_reward": 0.07738095335662365, + "completion_length": 1470.6607360839844, + "epoch": 0.05598320503848846, + "grad_norm": 0.37579816579818726, + "kl": 0.0135040283203125, + "learning_rate": 1.1111111111111113e-05, + "loss": 0.0005, + "reward": 0.365327388048172, + "reward_std": 0.4173513501882553, + "rewards/accuracy_reward": 0.1130952425301075, "rewards/format_reward": 0.0029761905316263437, - "rewards/tag_count_reward": 0.942708358168602, + "rewards/tag_count_reward": 0.2492559552192688, "step": 20 }, { - "completion_length": 514.2440567016602, - "epoch": 0.5874125874125874, - "grad_norm": 1.568800926208496, - "kl": 0.1781005859375, - "learning_rate": 8.485722224954237e-06, - "loss": 0.0071, - "reward": 1.011160746216774, - "reward_std": 0.21007433533668518, - "rewards/accuracy_reward": 0.047619049437344074, - "rewards/format_reward": 0.0029761905316263437, - "rewards/tag_count_reward": 0.9605655074119568, + "completion_length": 1427.0833740234375, + "epoch": 0.05878236529041288, + "grad_norm": 0.21863999962806702, + "kl": 0.0118255615234375, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.0005, + "reward": 0.3630952462553978, + "reward_std": 0.4389759972691536, + "rewards/accuracy_reward": 0.11904762149788439, + "rewards/format_reward": 0.008928571594879031, + "rewards/tag_count_reward": 0.2351190522313118, "step": 21 }, { - "completion_length": 655.8839263916016, - "epoch": 0.6153846153846154, - "grad_norm": 0.4743092358112335, - "kl": 0.1146240234375, - "learning_rate": 7.493474677412795e-06, - "loss": 0.0046, - "reward": 1.005952388048172, - "reward_std": 0.21222712844610214, - "rewards/accuracy_reward": 0.047619049437344074, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.958333358168602, + "completion_length": 1525.982177734375, + "epoch": 0.0615815255423373, + "grad_norm": 0.21818268299102783, + "kl": 0.0203399658203125, + "learning_rate": 1.2222222222222224e-05, + "loss": 0.0008, + "reward": 0.4136904776096344, + "reward_std": 0.47738203406333923, + "rewards/accuracy_reward": 0.08035714365541935, + "rewards/format_reward": 0.008928571594879031, + "rewards/tag_count_reward": 0.3244047686457634, "step": 22 }, { - "completion_length": 635.5000076293945, - "epoch": 0.6433566433566433, - "grad_norm": 0.1795119047164917, - "kl": 0.0771484375, - "learning_rate": 6.526947471551799e-06, - "loss": 0.0031, - "reward": 1.0587798058986664, - "reward_std": 0.17107740975916386, - "rewards/accuracy_reward": 0.08333333441987634, + "completion_length": 1605.3154602050781, + "epoch": 0.06438068579426172, + "grad_norm": 0.24474452435970306, + "kl": 0.017608642578125, + "learning_rate": 1.2777777777777777e-05, + "loss": 0.0007, + "reward": 0.3630952462553978, + "reward_std": 0.47906987369060516, + "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0029761905316263437, - "rewards/tag_count_reward": 0.9724702537059784, + "rewards/tag_count_reward": 0.2886904776096344, "step": 23 }, { - "completion_length": 542.9970245361328, - "epoch": 0.6713286713286714, - "grad_norm": 0.4369926452636719, - "kl": 0.091552734375, - "learning_rate": 5.5960584844236565e-06, - "loss": 0.0037, - "reward": 1.0498511791229248, - "reward_std": 0.19962026178836823, - "rewards/accuracy_reward": 0.08630952518433332, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416716337204, + "completion_length": 1668.9107666015625, + "epoch": 0.06717984604618614, + "grad_norm": 0.1840226799249649, + "kl": 0.018035888671875, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0007, + "reward": 0.5238095447421074, + "reward_std": 0.5313013195991516, + "rewards/accuracy_reward": 0.12797619262710214, + "rewards/format_reward": 0.0059523810632526875, + "rewards/tag_count_reward": 0.3898809626698494, "step": 24 }, { - "completion_length": 475.51786041259766, - "epoch": 0.6993006993006993, - "grad_norm": 0.24868783354759216, - "kl": 0.087890625, - "learning_rate": 4.710359896730379e-06, - "loss": 0.0035, - "reward": 1.0379464328289032, - "reward_std": 0.1798675712198019, - "rewards/accuracy_reward": 0.06845238199457526, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9694940596818924, + "completion_length": 1752.3839721679688, + "epoch": 0.06997900629811056, + "grad_norm": 0.11642763018608093, + "kl": 0.009307861328125, + "learning_rate": 1.388888888888889e-05, + "loss": 0.0004, + "reward": 0.534970261156559, + "reward_std": 0.5106568336486816, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/format_reward": 0.0029761905316263437, + "rewards/tag_count_reward": 0.427827388048172, "step": 25 }, { - "completion_length": 630.077392578125, - "epoch": 0.7272727272727273, - "grad_norm": 0.48154985904693604, - "kl": 0.08990478515625, - "learning_rate": 3.878940174523371e-06, - "loss": 0.0036, - "reward": 1.049107164144516, - "reward_std": 0.22189214080572128, - "rewards/accuracy_reward": 0.07440476515330374, - "rewards/format_reward": 0.0029761905316263437, - "rewards/tag_count_reward": 0.9717262089252472, + "completion_length": 1637.58935546875, + "epoch": 0.072778166550035, + "grad_norm": 0.1400088667869568, + "kl": 0.017242431640625, + "learning_rate": 1.4444444444444446e-05, + "loss": 0.0007, + "reward": 0.645833358168602, + "reward_std": 0.5355847254395485, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/format_reward": 0.0059523810632526875, + "rewards/tag_count_reward": 0.4940476268529892, "step": 26 }, { - "completion_length": 585.3452529907227, - "epoch": 0.7552447552447552, - "grad_norm": 0.4356919825077057, - "kl": 0.1033935546875, - "learning_rate": 3.110330809243134e-06, - "loss": 0.0041, - "reward": 1.0513392984867096, - "reward_std": 0.1829261239618063, - "rewards/accuracy_reward": 0.06845238176174462, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9828869253396988, + "completion_length": 1721.8244323730469, + "epoch": 0.07557732680195942, + "grad_norm": 0.13154961168766022, + "kl": 0.0110626220703125, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.0004, + "reward": 0.549851194024086, + "reward_std": 0.4883287250995636, + "rewards/accuracy_reward": 0.056547620333731174, + "rewards/format_reward": 0.0029761905316263437, + "rewards/tag_count_reward": 0.490327388048172, "step": 27 }, { - "completion_length": 895.6904907226562, - "epoch": 0.7832167832167832, - "grad_norm": 0.16385550796985626, - "kl": 0.05230712890625, - "learning_rate": 2.4124187730720916e-06, - "loss": 0.0021, - "reward": 1.1116071343421936, - "reward_std": 0.23503300920128822, - "rewards/accuracy_reward": 0.1339285746216774, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9776785969734192, + "completion_length": 1652.2440795898438, + "epoch": 0.07837648705388384, + "grad_norm": 0.15250514447689056, + "kl": 0.0216064453125, + "learning_rate": 1.555555555555556e-05, + "loss": 0.0009, + "reward": 0.6726190596818924, + "reward_std": 0.5264899879693985, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.008928571594879031, + "rewards/tag_count_reward": 0.5744047611951828, "step": 28 }, { - "completion_length": 850.2410888671875, - "epoch": 0.8111888111888111, - "grad_norm": 0.17005112767219543, - "kl": 0.04559326171875, - "learning_rate": 1.7923655879272395e-06, - "loss": 0.0018, - "reward": 1.0565476417541504, - "reward_std": 0.18085119873285294, - "rewards/accuracy_reward": 0.07440476398915052, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9821428805589676, + "completion_length": 1634.0952453613281, + "epoch": 0.08117564730580826, + "grad_norm": 0.3804445266723633, + "kl": 0.0277862548828125, + "learning_rate": 1.6111111111111115e-05, + "loss": 0.0011, + "reward": 0.662202388048172, + "reward_std": 0.49240704625844955, + "rewards/accuracy_reward": 0.05952381086535752, + "rewards/format_reward": 0.0059523810632526875, + "rewards/tag_count_reward": 0.596726194024086, "step": 29 }, { - "completion_length": 919.1726379394531, - "epoch": 0.8391608391608392, - "grad_norm": 0.20661063492298126, - "kl": 0.070068359375, - "learning_rate": 1.2565338385541792e-06, - "loss": 0.0028, - "reward": 1.0647321939468384, - "reward_std": 0.1509323362261057, - "rewards/accuracy_reward": 0.08035714458674192, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9843750149011612, + "completion_length": 1684.4910888671875, + "epoch": 0.08397480755773268, + "grad_norm": 0.13322682678699493, + "kl": 0.015289306640625, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0006, + "reward": 0.790178582072258, + "reward_std": 0.5224584564566612, + "rewards/accuracy_reward": 0.1190476231276989, + "rewards/format_reward": 0.008928571594879031, + "rewards/tag_count_reward": 0.662202388048172, "step": 30 }, { - "completion_length": 1035.3779907226562, - "epoch": 0.8671328671328671, - "grad_norm": 0.25298818945884705, - "kl": 0.05023193359375, - "learning_rate": 8.10421883797694e-07, - "loss": 0.002, - "reward": 1.0952381193637848, - "reward_std": 0.24173467233777046, - "rewards/accuracy_reward": 0.1190476231276989, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9761904925107956, + "completion_length": 1526.4494323730469, + "epoch": 0.0867739678096571, + "grad_norm": 0.25503799319267273, + "kl": 0.021148681640625, + "learning_rate": 1.7222222222222224e-05, + "loss": 0.0008, + "reward": 0.7819940745830536, + "reward_std": 0.4823927879333496, + "rewards/accuracy_reward": 0.09226190764456987, + "rewards/format_reward": 0.0059523810632526875, + "rewards/tag_count_reward": 0.683779776096344, "step": 31 }, { - "completion_length": 1049.2410888671875, - "epoch": 0.8951048951048951, - "grad_norm": 1.1570007801055908, - "kl": 0.059417724609375, - "learning_rate": 4.5860743599951186e-07, - "loss": 0.0024, - "reward": 1.2172619104385376, - "reward_std": 0.25820935517549515, - "rewards/accuracy_reward": 0.23214285634458065, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9851190745830536, + "completion_length": 1563.5595397949219, + "epoch": 0.08957312806158152, + "grad_norm": 0.33548209071159363, + "kl": 0.0294189453125, + "learning_rate": 1.7777777777777777e-05, + "loss": 0.0012, + "reward": 0.8266369104385376, + "reward_std": 0.4193369373679161, + "rewards/accuracy_reward": 0.07440476445481181, + "rewards/format_reward": 0.0059523810632526875, + "rewards/tag_count_reward": 0.7462797611951828, "step": 32 }, { - "completion_length": 1168.592269897461, - "epoch": 0.9230769230769231, - "grad_norm": 0.19920110702514648, - "kl": 0.036407470703125, - "learning_rate": 2.0470058747505516e-07, - "loss": 0.0015, - "reward": 1.0907738506793976, - "reward_std": 0.2389066442847252, - "rewards/accuracy_reward": 0.11904762033373117, - "rewards/format_reward": 0.0029761905316263437, - "rewards/tag_count_reward": 0.9687500149011612, + "completion_length": 1314.2440795898438, + "epoch": 0.09237228831350595, + "grad_norm": 0.1565406769514084, + "kl": 0.0249786376953125, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.001, + "reward": 0.8802083432674408, + "reward_std": 0.449076771736145, + "rewards/accuracy_reward": 0.10714285913854837, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7730654925107956, "step": 33 }, { - "completion_length": 906.2440643310547, - "epoch": 0.951048951048951, - "grad_norm": 0.9165655374526978, - "kl": 0.09033203125, - "learning_rate": 5.1306766081048456e-08, - "loss": 0.0036, - "reward": 1.1302083432674408, - "reward_std": 0.22381277196109295, - "rewards/accuracy_reward": 0.15773809468373656, + "completion_length": 1388.1131591796875, + "epoch": 0.09517144856543037, + "grad_norm": 0.6069921255111694, + "kl": 0.02899169921875, + "learning_rate": 1.888888888888889e-05, + "loss": 0.0012, + "reward": 0.8735119253396988, + "reward_std": 0.4563302770256996, + "rewards/accuracy_reward": 0.09821428777649999, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9724702686071396, + "rewards/tag_count_reward": 0.7752976417541504, "step": 34 }, { - "completion_length": 1167.5684509277344, - "epoch": 0.9790209790209791, - "grad_norm": 0.1901860535144806, - "kl": 0.044952392578125, - "learning_rate": 0.0, - "loss": 0.0018, - "reward": 1.06324402987957, - "reward_std": 0.22568430751562119, - "rewards/accuracy_reward": 0.08333333604969084, + "completion_length": 1307.1131286621094, + "epoch": 0.0979706088173548, + "grad_norm": 0.19676795601844788, + "kl": 0.026031494140625, + "learning_rate": 1.9444444444444445e-05, + "loss": 0.001, + "reward": 0.9174107164144516, + "reward_std": 0.32133132219314575, + "rewards/accuracy_reward": 0.06845238339155912, "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9799107313156128, + "rewards/tag_count_reward": 0.8489583432674408, "step": 35 }, { - "epoch": 0.9790209790209791, - "step": 35, + "completion_length": 1179.8333740234375, + "epoch": 0.10076976906927922, + "grad_norm": 0.17550034821033478, + "kl": 0.0313720703125, + "learning_rate": 2e-05, + "loss": 0.0013, + "reward": 1.0200892984867096, + "reward_std": 0.4127752333879471, + "rewards/accuracy_reward": 0.19047619588673115, + "rewards/format_reward": 0.0059523810632526875, + "rewards/tag_count_reward": 0.8236607313156128, + "step": 36 + }, + { + "completion_length": 1442.8303833007812, + "epoch": 0.10356892932120364, + "grad_norm": 1.053062915802002, + "kl": 0.066497802734375, + "learning_rate": 1.9999521087449523e-05, + "loss": 0.0027, + "reward": 0.8950892984867096, + "reward_std": 0.4056769236922264, + "rewards/accuracy_reward": 0.08035714505240321, + "rewards/format_reward": 0.0059523810632526875, + "rewards/tag_count_reward": 0.8087797909975052, + "step": 37 + }, + { + "completion_length": 1242.2083740234375, + "epoch": 0.10636808957312806, + "grad_norm": 0.3614981174468994, + "kl": 0.03375244140625, + "learning_rate": 1.9998084395669537e-05, + "loss": 0.0014, + "reward": 0.9456845223903656, + "reward_std": 0.3580349460244179, + "rewards/accuracy_reward": 0.08333333535119891, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8623512089252472, + "step": 38 + }, + { + "completion_length": 1440.8274230957031, + "epoch": 0.10916724982505248, + "grad_norm": 0.19032098352909088, + "kl": 0.0296630859375, + "learning_rate": 1.9995690062269985e-05, + "loss": 0.0012, + "reward": 0.9226190596818924, + "reward_std": 0.3531223088502884, + "rewards/accuracy_reward": 0.06547619309276342, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.8571428805589676, + "step": 39 + }, + { + "completion_length": 1399.6636962890625, + "epoch": 0.11196641007697691, + "grad_norm": 1.1726051568984985, + "kl": 0.156707763671875, + "learning_rate": 1.9992338316586132e-05, + "loss": 0.0062, + "reward": 0.865327388048172, + "reward_std": 0.43585436791181564, + "rewards/accuracy_reward": 0.08630952634848654, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7790178656578064, + "step": 40 + }, + { + "completion_length": 1369.9940795898438, + "epoch": 0.11476557032890133, + "grad_norm": 0.4513668119907379, + "kl": 0.04132080078125, + "learning_rate": 1.9988029479656596e-05, + "loss": 0.0017, + "reward": 0.8816964477300644, + "reward_std": 0.4034172296524048, + "rewards/accuracy_reward": 0.09226190764456987, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7894345372915268, + "step": 41 + }, + { + "completion_length": 1270.7053833007812, + "epoch": 0.11756473058082575, + "grad_norm": 0.31328853964805603, + "kl": 0.06109619140625, + "learning_rate": 1.9982763964192586e-05, + "loss": 0.0024, + "reward": 0.8377976417541504, + "reward_std": 0.41212867200374603, + "rewards/accuracy_reward": 0.07738095428794622, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7604166716337204, + "step": 42 + }, + { + "completion_length": 1071.184555053711, + "epoch": 0.12036389083275018, + "grad_norm": 0.9629775285720825, + "kl": 0.1064453125, + "learning_rate": 1.9976542274538394e-05, + "loss": 0.0043, + "reward": 0.8727678805589676, + "reward_std": 0.36533668637275696, + "rewards/accuracy_reward": 0.0863095261156559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7864583283662796, + "step": 43 + }, + { + "completion_length": 1052.4880981445312, + "epoch": 0.1231630510846746, + "grad_norm": 0.4314608573913574, + "kl": 0.156005859375, + "learning_rate": 1.9969365006623072e-05, + "loss": 0.0062, + "reward": 0.8683035969734192, + "reward_std": 0.40765517204999924, + "rewards/accuracy_reward": 0.11607143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7522321492433548, + "step": 44 + }, + { + "completion_length": 904.7143096923828, + "epoch": 0.12596221133659902, + "grad_norm": 0.8732027411460876, + "kl": 0.296875, + "learning_rate": 1.996123284790336e-05, + "loss": 0.0119, + "reward": 0.8043155074119568, + "reward_std": 0.3967125192284584, + "rewards/accuracy_reward": 0.06845238339155912, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.7358631044626236, + "step": 45 + }, + { + "completion_length": 961.8631286621094, + "epoch": 0.12876137158852344, + "grad_norm": 3.4988186359405518, + "kl": 0.91455078125, + "learning_rate": 1.9952146577297827e-05, + "loss": 0.0366, + "reward": 0.7328869253396988, + "reward_std": 0.3989041745662689, + "rewards/accuracy_reward": 0.05059523927047849, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6822916716337204, + "step": 46 + }, + { + "completion_length": 1308.2262268066406, + "epoch": 0.13156053184044786, + "grad_norm": 2.489703893661499, + "kl": 0.60693359375, + "learning_rate": 1.9942107065112286e-05, + "loss": 0.0243, + "reward": 0.5997024178504944, + "reward_std": 0.4082387238740921, + "rewards/accuracy_reward": 0.014880952658131719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5848214477300644, + "step": 47 + }, + { + "completion_length": 1292.3750305175781, + "epoch": 0.13435969209237228, + "grad_norm": 0.8369948267936707, + "kl": 0.52001953125, + "learning_rate": 1.9931115272956405e-05, + "loss": 0.0208, + "reward": 0.5394345298409462, + "reward_std": 0.4319700449705124, + "rewards/accuracy_reward": 0.023809524485841393, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5156250149011612, + "step": 48 + }, + { + "completion_length": 1508.2530212402344, + "epoch": 0.1371588523442967, + "grad_norm": 0.8183385133743286, + "kl": 0.40283203125, + "learning_rate": 1.9919172253651637e-05, + "loss": 0.0161, + "reward": 0.6175595372915268, + "reward_std": 0.493251696228981, + "rewards/accuracy_reward": 0.06845238246023655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5491071492433548, + "step": 49 + }, + { + "completion_length": 1316.7054138183594, + "epoch": 0.13995801259622112, + "grad_norm": 0.6826545000076294, + "kl": 0.453125, + "learning_rate": 1.9906279151130338e-05, + "loss": 0.0181, + "reward": 0.6547619104385376, + "reward_std": 0.46080518513917923, + "rewards/accuracy_reward": 0.038690477376803756, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6160714328289032, + "step": 50 + }, + { + "completion_length": 1436.3869323730469, + "epoch": 0.14275717284814557, + "grad_norm": 0.7981727123260498, + "kl": 0.47705078125, + "learning_rate": 1.989243720032624e-05, + "loss": 0.0191, + "reward": 0.783482164144516, + "reward_std": 0.45741092413663864, + "rewards/accuracy_reward": 0.09821428824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6852678507566452, + "step": 51 + }, + { + "completion_length": 1488.9881286621094, + "epoch": 0.14555633310007, + "grad_norm": 2.580409288406372, + "kl": 1.0712890625, + "learning_rate": 1.987764772705613e-05, + "loss": 0.0428, + "reward": 0.8020833432674408, + "reward_std": 0.45223573595285416, + "rewards/accuracy_reward": 0.11904762033373117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6830357313156128, + "step": 52 + }, + { + "completion_length": 1470.5744018554688, + "epoch": 0.1483554933519944, + "grad_norm": 2.836758852005005, + "kl": 0.8916015625, + "learning_rate": 1.9861912147892884e-05, + "loss": 0.0357, + "reward": 0.7693452537059784, + "reward_std": 0.37412961572408676, + "rewards/accuracy_reward": 0.07142857182770967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6979166716337204, + "step": 53 + }, + { + "completion_length": 1499.5505981445312, + "epoch": 0.15115465360391883, + "grad_norm": 9.416851997375488, + "kl": 3.708984375, + "learning_rate": 1.9845231970029774e-05, + "loss": 0.1484, + "reward": 0.802083358168602, + "reward_std": 0.38054975122213364, + "rewards/accuracy_reward": 0.11904762149788439, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6830357313156128, + "step": 54 + }, + { + "completion_length": 1472.77685546875, + "epoch": 0.15395381385584325, + "grad_norm": 9.983410835266113, + "kl": 1.068359375, + "learning_rate": 1.98276087911361e-05, + "loss": 0.0428, + "reward": 0.783482164144516, + "reward_std": 0.46770229935646057, + "rewards/accuracy_reward": 0.14583333604969084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6376488208770752, + "step": 55 + }, + { + "completion_length": 1379.4494323730469, + "epoch": 0.15675297410776767, + "grad_norm": 32.2557373046875, + "kl": 8.84375, + "learning_rate": 1.9809044299204173e-05, + "loss": 0.3537, + "reward": 0.6636904776096344, + "reward_std": 0.4070945233106613, + "rewards/accuracy_reward": 0.05654762126505375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.6071428656578064, + "step": 56 + }, + { + "completion_length": 1611.4137268066406, + "epoch": 0.1595521343596921, + "grad_norm": 4.396721839904785, + "kl": 0.712890625, + "learning_rate": 1.978954027238763e-05, + "loss": 0.0285, + "reward": 0.7351190745830536, + "reward_std": 0.4506368339061737, + "rewards/accuracy_reward": 0.16071429126895964, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.574404776096344, + "step": 57 + }, + { + "completion_length": 1649.7232666015625, + "epoch": 0.16235129461161651, + "grad_norm": 2.292630910873413, + "kl": 1.41015625, + "learning_rate": 1.9769098578831113e-05, + "loss": 0.0564, + "reward": 0.712053582072258, + "reward_std": 0.4804087057709694, + "rewards/accuracy_reward": 0.17857143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5334821417927742, + "step": 58 + }, + { + "completion_length": 1417.9166870117188, + "epoch": 0.16515045486354094, + "grad_norm": 5.817103862762451, + "kl": 1.359375, + "learning_rate": 1.974772117649135e-05, + "loss": 0.0543, + "reward": 0.6770833283662796, + "reward_std": 0.49610138684511185, + "rewards/accuracy_reward": 0.16666667070239782, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.510416679084301, + "step": 59 + }, + { + "completion_length": 1464.5059509277344, + "epoch": 0.16794961511546536, + "grad_norm": 17.89388084411621, + "kl": 6.7578125, + "learning_rate": 1.972541011294959e-05, + "loss": 0.2704, + "reward": 0.7537202388048172, + "reward_std": 0.49375829100608826, + "rewards/accuracy_reward": 0.2113095298409462, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5424107164144516, + "step": 60 + }, + { + "completion_length": 1358.7113037109375, + "epoch": 0.17074877536738978, + "grad_norm": 4.328094959259033, + "kl": 1.0810546875, + "learning_rate": 1.9702167525215504e-05, + "loss": 0.0432, + "reward": 0.6614583283662796, + "reward_std": 0.46823926270008087, + "rewards/accuracy_reward": 0.16964286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.491815485060215, + "step": 61 + }, + { + "completion_length": 1405.169677734375, + "epoch": 0.1735479356193142, + "grad_norm": 5.302358627319336, + "kl": 3.59765625, + "learning_rate": 1.9677995639522482e-05, + "loss": 0.1441, + "reward": 0.4947916641831398, + "reward_std": 0.38233522325754166, + "rewards/accuracy_reward": 0.06845238199457526, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.426339291036129, + "step": 62 + }, + { + "completion_length": 1460.8423156738281, + "epoch": 0.17634709587123862, + "grad_norm": 2.2617859840393066, + "kl": 3.359375, + "learning_rate": 1.9652896771114416e-05, + "loss": 0.1343, + "reward": 0.4441964328289032, + "reward_std": 0.4101327210664749, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3727678582072258, + "step": 63 + }, + { + "completion_length": 1583.0505981445312, + "epoch": 0.17914625612316304, + "grad_norm": 6.0778584480285645, + "kl": 2.34375, + "learning_rate": 1.9626873324023915e-05, + "loss": 0.0938, + "reward": 0.4516369178891182, + "reward_std": 0.40664728730916977, + "rewards/accuracy_reward": 0.08630952518433332, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.365327388048172, + "step": 64 + }, + { + "completion_length": 1629.2321472167969, + "epoch": 0.1819454163750875, + "grad_norm": 6.9924211502075195, + "kl": 3.369140625, + "learning_rate": 1.959992779084207e-05, + "loss": 0.1349, + "reward": 0.4933035746216774, + "reward_std": 0.4479127451777458, + "rewards/accuracy_reward": 0.14880952890962362, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3444940596818924, + "step": 65 + }, + { + "completion_length": 1934.0833435058594, + "epoch": 0.1847445766270119, + "grad_norm": 97.51799011230469, + "kl": 24.03125, + "learning_rate": 1.9572062752479684e-05, + "loss": 0.9617, + "reward": 0.4270833432674408, + "reward_std": 0.42346952110528946, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.322916679084301, + "step": 66 + }, + { + "completion_length": 2216.2529907226562, + "epoch": 0.18754373687893633, + "grad_norm": 9.205163955688477, + "kl": 2.97265625, + "learning_rate": 1.9543280877920073e-05, + "loss": 0.1189, + "reward": 0.3273809552192688, + "reward_std": 0.33986856043338776, + "rewards/accuracy_reward": 0.06547619309276342, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2619047649204731, + "step": 67 + }, + { + "completion_length": 2244.4256286621094, + "epoch": 0.19034289713086075, + "grad_norm": 3.074749231338501, + "kl": 4.1875, + "learning_rate": 1.9513584923963426e-05, + "loss": 0.1676, + "reward": 0.404761902987957, + "reward_std": 0.4150353893637657, + "rewards/accuracy_reward": 0.12202381156384945, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2827381007373333, + "step": 68 + }, + { + "completion_length": 1985.6458740234375, + "epoch": 0.19314205738278517, + "grad_norm": 7.568788528442383, + "kl": 0.978515625, + "learning_rate": 1.9482977734962753e-05, + "loss": 0.0392, + "reward": 0.3117559626698494, + "reward_std": 0.3240882083773613, + "rewards/accuracy_reward": 0.06845238199457526, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2433035746216774, + "step": 69 + }, + { + "completion_length": 2261.0565185546875, + "epoch": 0.1959412176347096, + "grad_norm": 2.400193452835083, + "kl": 1.630859375, + "learning_rate": 1.945146224255145e-05, + "loss": 0.0652, + "reward": 0.332589291036129, + "reward_std": 0.3499395176768303, + "rewards/accuracy_reward": 0.10714286100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2254464365541935, + "step": 70 + }, + { + "completion_length": 2354.7976684570312, + "epoch": 0.198740377886634, + "grad_norm": 1.3778446912765503, + "kl": 1.369140625, + "learning_rate": 1.9419041465362477e-05, + "loss": 0.0547, + "reward": 0.2581845261156559, + "reward_std": 0.2820921465754509, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1867559552192688, + "step": 71 + }, + { + "completion_length": 2364.2560424804688, + "epoch": 0.20153953813855843, + "grad_norm": 2.740722417831421, + "kl": 0.39306640625, + "learning_rate": 1.9385718508739263e-05, + "loss": 0.0157, + "reward": 0.2440476194024086, + "reward_std": 0.25263551250100136, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1994047649204731, + "step": 72 + }, + { + "completion_length": 2364.806610107422, + "epoch": 0.20433869839048285, + "grad_norm": 1.480932593345642, + "kl": 0.3916015625, + "learning_rate": 1.9351496564438228e-05, + "loss": 0.0157, + "reward": 0.2522321492433548, + "reward_std": 0.2653038240969181, + "rewards/accuracy_reward": 0.06250000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1897321492433548, + "step": 73 + }, + { + "completion_length": 2264.5982971191406, + "epoch": 0.20713785864240727, + "grad_norm": 0.2756696939468384, + "kl": 0.31103515625, + "learning_rate": 1.93163789103231e-05, + "loss": 0.0124, + "reward": 0.2544642873108387, + "reward_std": 0.306577168405056, + "rewards/accuracy_reward": 0.0863095261156559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1681547649204731, + "step": 74 + }, + { + "completion_length": 1999.2411193847656, + "epoch": 0.2099370188943317, + "grad_norm": 0.6625251173973083, + "kl": 0.223876953125, + "learning_rate": 1.9280368910050943e-05, + "loss": 0.009, + "reward": 0.382440485060215, + "reward_std": 0.3074917793273926, + "rewards/accuracy_reward": 0.16369047947227955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2187500037252903, + "step": 75 + }, + { + "completion_length": 2024.6934814453125, + "epoch": 0.21273617914625612, + "grad_norm": 0.4294309616088867, + "kl": 0.1483154296875, + "learning_rate": 1.9243470012749968e-05, + "loss": 0.0059, + "reward": 0.2879464291036129, + "reward_std": 0.2326599396765232, + "rewards/accuracy_reward": 0.03869047691114247, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2492559552192688, + "step": 76 + }, + { + "completion_length": 1902.5327758789062, + "epoch": 0.21553533939818054, + "grad_norm": 0.1263069361448288, + "kl": 0.1097412109375, + "learning_rate": 1.9205685752689178e-05, + "loss": 0.0044, + "reward": 0.3236607164144516, + "reward_std": 0.3096480742096901, + "rewards/accuracy_reward": 0.08630952797830105, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.237351194024086, + "step": 77 + }, + { + "completion_length": 1739.0327453613281, + "epoch": 0.21833449965010496, + "grad_norm": 0.22840408980846405, + "kl": 0.1258544921875, + "learning_rate": 1.9167019748939847e-05, + "loss": 0.005, + "reward": 0.335565485060215, + "reward_std": 0.28444118052721024, + "rewards/accuracy_reward": 0.056547620333731174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2790178582072258, + "step": 78 + }, + { + "completion_length": 1673.0863647460938, + "epoch": 0.22113365990202938, + "grad_norm": 62.03584671020508, + "kl": 4.39373779296875, + "learning_rate": 1.9127475705028864e-05, + "loss": 0.176, + "reward": 0.4456845372915268, + "reward_std": 0.3754320666193962, + "rewards/accuracy_reward": 0.16071428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2849702388048172, + "step": 79 + }, + { + "completion_length": 1479.735107421875, + "epoch": 0.22393282015395383, + "grad_norm": 0.14882853627204895, + "kl": 0.077392578125, + "learning_rate": 1.908705740858402e-05, + "loss": 0.0031, + "reward": 0.4561012014746666, + "reward_std": 0.3437865376472473, + "rewards/accuracy_reward": 0.16071428917348385, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2953869141638279, + "step": 80 + }, + { + "completion_length": 1570.1071472167969, + "epoch": 0.22673198040587825, + "grad_norm": 0.14695590734481812, + "kl": 0.07684326171875, + "learning_rate": 1.9045768730971198e-05, + "loss": 0.0031, + "reward": 0.459077388048172, + "reward_std": 0.3266071379184723, + "rewards/accuracy_reward": 0.127976194024086, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.331101194024086, + "step": 81 + }, + { + "completion_length": 1588.2976379394531, + "epoch": 0.22953114065780267, + "grad_norm": 0.21373598277568817, + "kl": 0.083251953125, + "learning_rate": 1.900361362692358e-05, + "loss": 0.0033, + "reward": 0.465029776096344, + "reward_std": 0.372719869017601, + "rewards/accuracy_reward": 0.14880952797830105, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3162202462553978, + "step": 82 + }, + { + "completion_length": 1511.952392578125, + "epoch": 0.2323303009097271, + "grad_norm": 0.48833397030830383, + "kl": 0.0877685546875, + "learning_rate": 1.8960596134162845e-05, + "loss": 0.0035, + "reward": 0.4680059626698494, + "reward_std": 0.377053365111351, + "rewards/accuracy_reward": 0.1369047649204731, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3311012014746666, + "step": 83 + }, + { + "completion_length": 1497.1011962890625, + "epoch": 0.2351294611616515, + "grad_norm": 1.3906663656234741, + "kl": 0.0849609375, + "learning_rate": 1.8916720373012425e-05, + "loss": 0.0034, + "reward": 0.3921131119132042, + "reward_std": 0.29465848952531815, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3385416716337204, + "step": 84 + }, + { + "completion_length": 1447.3541870117188, + "epoch": 0.23792862141357593, + "grad_norm": 3.8026514053344727, + "kl": 0.13427734375, + "learning_rate": 1.887199054600286e-05, + "loss": 0.0054, + "reward": 0.3802083432674408, + "reward_std": 0.3280208334326744, + "rewards/accuracy_reward": 0.07440476352348924, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.305803582072258, + "step": 85 + }, + { + "completion_length": 1325.6339721679688, + "epoch": 0.24072778166550035, + "grad_norm": 6.468008995056152, + "kl": 0.2919921875, + "learning_rate": 1.8826410937469256e-05, + "loss": 0.0117, + "reward": 0.3586309626698494, + "reward_std": 0.3128962069749832, + "rewards/accuracy_reward": 0.056547621032223105, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3020833358168602, + "step": 86 + }, + { + "completion_length": 1401.794677734375, + "epoch": 0.24352694191742477, + "grad_norm": 4.565969944000244, + "kl": 1.2275390625, + "learning_rate": 1.8779985913140927e-05, + "loss": 0.0491, + "reward": 0.3660714328289032, + "reward_std": 0.34038255363702774, + "rewards/accuracy_reward": 0.0922619067132473, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2738095335662365, + "step": 87 + }, + { + "completion_length": 1474.7649230957031, + "epoch": 0.2463261021693492, + "grad_norm": 6.203220844268799, + "kl": 1.876953125, + "learning_rate": 1.873271991972323e-05, + "loss": 0.0751, + "reward": 0.4382440522313118, + "reward_std": 0.40872056037187576, + "rewards/accuracy_reward": 0.16369047854095697, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2745535708963871, + "step": 88 + }, + { + "completion_length": 1377.1488342285156, + "epoch": 0.2491252624212736, + "grad_norm": 30.667940139770508, + "kl": 10.359375, + "learning_rate": 1.8684617484471662e-05, + "loss": 0.4151, + "reward": 0.3816964328289032, + "reward_std": 0.3514695316553116, + "rewards/accuracy_reward": 0.10714285960420966, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2745535746216774, + "step": 89 + }, + { + "completion_length": 1392.4940490722656, + "epoch": 0.25192442267319803, + "grad_norm": 2.3907365798950195, + "kl": 2.6015625, + "learning_rate": 1.8635683214758213e-05, + "loss": 0.1041, + "reward": 0.3474702462553978, + "reward_std": 0.3414006531238556, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.267113097012043, + "step": 90 + }, + { + "completion_length": 1432.6636962890625, + "epoch": 0.2547235829251225, + "grad_norm": 2.4524078369140625, + "kl": 2.2734375, + "learning_rate": 1.8585921797630064e-05, + "loss": 0.0908, + "reward": 0.3690476268529892, + "reward_std": 0.33869466185569763, + "rewards/accuracy_reward": 0.11607143003493547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2529762014746666, + "step": 91 + }, + { + "completion_length": 1742.982177734375, + "epoch": 0.2575227431770469, + "grad_norm": 14.129288673400879, + "kl": 7.234375, + "learning_rate": 1.8535337999360655e-05, + "loss": 0.2895, + "reward": 0.3162202388048172, + "reward_std": 0.28868982940912247, + "rewards/accuracy_reward": 0.04464285844005644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.271577388048172, + "step": 92 + }, + { + "completion_length": 1622.3392944335938, + "epoch": 0.2603219034289713, + "grad_norm": 11.640968322753906, + "kl": 5.56640625, + "learning_rate": 1.8483936664993152e-05, + "loss": 0.2229, + "reward": 0.3563988208770752, + "reward_std": 0.3877794221043587, + "rewards/accuracy_reward": 0.11011904943734407, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2462797686457634, + "step": 93 + }, + { + "completion_length": 1711.982177734375, + "epoch": 0.2631210636808957, + "grad_norm": 5.302284240722656, + "kl": 2.29296875, + "learning_rate": 1.8431722717876383e-05, + "loss": 0.0917, + "reward": 0.3742559626698494, + "reward_std": 0.3682831898331642, + "rewards/accuracy_reward": 0.11309524346143007, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2611607164144516, + "step": 94 + }, + { + "completion_length": 1736.8750305175781, + "epoch": 0.26592022393282017, + "grad_norm": 4.685518741607666, + "kl": 1.94921875, + "learning_rate": 1.837870115919327e-05, + "loss": 0.0779, + "reward": 0.3288690596818924, + "reward_std": 0.31656138598918915, + "rewards/accuracy_reward": 0.06547619216144085, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2633928619325161, + "step": 95 + }, + { + "completion_length": 1903.4315490722656, + "epoch": 0.26871938418474456, + "grad_norm": 11.830875396728516, + "kl": 6.375, + "learning_rate": 1.8324877067481782e-05, + "loss": 0.2549, + "reward": 0.2864583395421505, + "reward_std": 0.28968408703804016, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2418154813349247, + "step": 96 + }, + { + "completion_length": 2018.3988952636719, + "epoch": 0.271518544436669, + "grad_norm": 9.407161712646484, + "kl": 5.4921875, + "learning_rate": 1.8270255598148542e-05, + "loss": 0.2198, + "reward": 0.3221726194024086, + "reward_std": 0.3211013078689575, + "rewards/accuracy_reward": 0.07440476445481181, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2477678582072258, + "step": 97 + }, + { + "completion_length": 1971.5178527832031, + "epoch": 0.2743177046885934, + "grad_norm": 1.8556294441223145, + "kl": 2.181640625, + "learning_rate": 1.8214841982974975e-05, + "loss": 0.0872, + "reward": 0.2924107201397419, + "reward_std": 0.33721090108156204, + "rewards/accuracy_reward": 0.07440476305782795, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2180059552192688, + "step": 98 + }, + { + "completion_length": 2205.669677734375, + "epoch": 0.27711686494051785, + "grad_norm": 6.686934947967529, + "kl": 3.87109375, + "learning_rate": 1.815864152961624e-05, + "loss": 0.1549, + "reward": 0.2790178693830967, + "reward_std": 0.3240780681371689, + "rewards/accuracy_reward": 0.06250000139698386, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2165178619325161, + "step": 99 + }, + { + "completion_length": 1977.264892578125, + "epoch": 0.27991602519244224, + "grad_norm": 3.179499626159668, + "kl": 2.650390625, + "learning_rate": 1.8101659621092832e-05, + "loss": 0.1062, + "reward": 0.3653273805975914, + "reward_std": 0.3543194383382797, + "rewards/accuracy_reward": 0.10416666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2611607164144516, + "step": 100 + }, + { + "completion_length": 1955.6339721679688, + "epoch": 0.2827151854443667, + "grad_norm": 83.43329620361328, + "kl": 22.65625, + "learning_rate": 1.804390171527497e-05, + "loss": 0.9068, + "reward": 0.2425595261156559, + "reward_std": 0.28768592327833176, + "rewards/accuracy_reward": 0.059523812495172024, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1830357164144516, + "step": 101 + }, + { + "completion_length": 2183.482208251953, + "epoch": 0.28551434569629114, + "grad_norm": 28.742935180664062, + "kl": 10.9296875, + "learning_rate": 1.798537334435986e-05, + "loss": 0.4374, + "reward": 0.2656250037252903, + "reward_std": 0.31604088470339775, + "rewards/accuracy_reward": 0.0744047632906586, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1912202388048172, + "step": 102 + }, + { + "completion_length": 2065.9732055664062, + "epoch": 0.28831350594821553, + "grad_norm": 5.615250110626221, + "kl": 1.46484375, + "learning_rate": 1.792608011434178e-05, + "loss": 0.0586, + "reward": 0.2604166679084301, + "reward_std": 0.3304464593529701, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1889881007373333, + "step": 103 + }, + { + "completion_length": 2101.6755981445312, + "epoch": 0.29111266620014, + "grad_norm": 4.398268222808838, + "kl": 1.478515625, + "learning_rate": 1.786602770447513e-05, + "loss": 0.0591, + "reward": 0.2730654813349247, + "reward_std": 0.33250241726636887, + "rewards/accuracy_reward": 0.08333333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.1897321417927742, + "step": 104 + }, + { + "completion_length": 2183.6785888671875, + "epoch": 0.2939118264520644, + "grad_norm": 2.4567322731018066, + "kl": 3.703125, + "learning_rate": 1.780522186673046e-05, + "loss": 0.1481, + "reward": 0.2976190522313118, + "reward_std": 0.3433954492211342, + "rewards/accuracy_reward": 0.06250000139698386, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2351190522313118, + "step": 105 + }, + { + "completion_length": 2175.7500610351562, + "epoch": 0.2967109867039888, + "grad_norm": 8.868399620056152, + "kl": 9.59375, + "learning_rate": 1.7743668425243547e-05, + "loss": 0.3843, + "reward": 0.2849702499806881, + "reward_std": 0.35335545986890793, + "rewards/accuracy_reward": 0.05654761986806989, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2284226231276989, + "step": 106 + }, + { + "completion_length": 2245.6934509277344, + "epoch": 0.2995101469559132, + "grad_norm": 8.676541328430176, + "kl": 8.109375, + "learning_rate": 1.768137327575751e-05, + "loss": 0.3248, + "reward": 0.2686011977493763, + "reward_std": 0.32574551552534103, + "rewards/accuracy_reward": 0.04166666814126074, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2269345335662365, + "step": 107 + }, + { + "completion_length": 2097.5357666015625, + "epoch": 0.30230930720783766, + "grad_norm": 2.091578722000122, + "kl": 3.2734375, + "learning_rate": 1.7618342385058147e-05, + "loss": 0.1307, + "reward": 0.3258928582072258, + "reward_std": 0.3488500714302063, + "rewards/accuracy_reward": 0.10119047947227955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.224702388048172, + "step": 108 + }, + { + "completion_length": 2125.3155212402344, + "epoch": 0.30510846745976206, + "grad_norm": 3.9262006282806396, + "kl": 2.7578125, + "learning_rate": 1.7554581790402372e-05, + "loss": 0.1105, + "reward": 0.3385416716337204, + "reward_std": 0.38373828679323196, + "rewards/accuracy_reward": 0.06547619285993278, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2730654813349247, + "step": 109 + }, + { + "completion_length": 2063.9375610351562, + "epoch": 0.3079076277116865, + "grad_norm": 2.227835178375244, + "kl": 3.26171875, + "learning_rate": 1.749009759893999e-05, + "loss": 0.1303, + "reward": 0.3816964253783226, + "reward_std": 0.417699970304966, + "rewards/accuracy_reward": 0.10119047993794084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2805059626698494, + "step": 110 + }, + { + "completion_length": 1998.9286193847656, + "epoch": 0.3107067879636109, + "grad_norm": 17.66851806640625, + "kl": 12.578125, + "learning_rate": 1.7424895987128723e-05, + "loss": 0.5034, + "reward": 0.3593750074505806, + "reward_std": 0.38737890124320984, + "rewards/accuracy_reward": 0.07738095638342202, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2819940522313118, + "step": 111 + }, + { + "completion_length": 1848.2440795898438, + "epoch": 0.31350594821553535, + "grad_norm": 32.354515075683594, + "kl": 16.15625, + "learning_rate": 1.7358983200142608e-05, + "loss": 0.646, + "reward": 0.4627976343035698, + "reward_std": 0.4812953993678093, + "rewards/accuracy_reward": 0.1488095261156559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3139881044626236, + "step": 112 + }, + { + "completion_length": 1820.3452758789062, + "epoch": 0.31630510846745974, + "grad_norm": 3.6990959644317627, + "kl": 6.578125, + "learning_rate": 1.7292365551273835e-05, + "loss": 0.2631, + "reward": 0.3720238208770752, + "reward_std": 0.34715334326028824, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3184523954987526, + "step": 113 + }, + { + "completion_length": 1722.5952758789062, + "epoch": 0.3191042687193842, + "grad_norm": 3.8126943111419678, + "kl": 3.0234375, + "learning_rate": 1.7225049421328024e-05, + "loss": 0.1211, + "reward": 0.393601194024086, + "reward_std": 0.37151359021663666, + "rewards/accuracy_reward": 0.04761904873885214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3459821417927742, + "step": 114 + }, + { + "completion_length": 1612.1726379394531, + "epoch": 0.3219034289713086, + "grad_norm": 3.412381172180176, + "kl": 2.306640625, + "learning_rate": 1.7157041258013074e-05, + "loss": 0.0923, + "reward": 0.4449404776096344, + "reward_std": 0.39042579382658005, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3407738134264946, + "step": 115 + }, + { + "completion_length": 1911.6279907226562, + "epoch": 0.32470258922323303, + "grad_norm": 2.633307456970215, + "kl": 6.6015625, + "learning_rate": 1.7088347575321575e-05, + "loss": 0.2641, + "reward": 0.3162202462553978, + "reward_std": 0.33684205263853073, + "rewards/accuracy_reward": 0.029761906014755368, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2864583320915699, + "step": 116 + }, + { + "completion_length": 1748.3660888671875, + "epoch": 0.3275017494751575, + "grad_norm": 4.045055866241455, + "kl": 6.953125, + "learning_rate": 1.7018974952906885e-05, + "loss": 0.2784, + "reward": 0.4233631044626236, + "reward_std": 0.41040540486574173, + "rewards/accuracy_reward": 0.06845238339155912, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3549107164144516, + "step": 117 + }, + { + "completion_length": 1700.4226684570312, + "epoch": 0.33030090972708187, + "grad_norm": 2.454699993133545, + "kl": 5.203125, + "learning_rate": 1.6948930035452905e-05, + "loss": 0.2082, + "reward": 0.3586309626698494, + "reward_std": 0.35025156289339066, + "rewards/accuracy_reward": 0.026785715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3318452462553978, + "step": 118 + }, + { + "completion_length": 1640.9940490722656, + "epoch": 0.3331000699790063, + "grad_norm": 2.981011390686035, + "kl": 3.091796875, + "learning_rate": 1.687821953203765e-05, + "loss": 0.1235, + "reward": 0.426339291036129, + "reward_std": 0.3910770118236542, + "rewards/accuracy_reward": 0.07142857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3549107164144516, + "step": 119 + }, + { + "completion_length": 1564.9256286621094, + "epoch": 0.3358992302309307, + "grad_norm": 2.033478021621704, + "kl": 3.33203125, + "learning_rate": 1.680685021549063e-05, + "loss": 0.1333, + "reward": 0.5133928656578064, + "reward_std": 0.42733435332775116, + "rewards/accuracy_reward": 0.12500000465661287, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3883928582072258, + "step": 120 + }, + { + "completion_length": 1634.2976379394531, + "epoch": 0.33869839048285516, + "grad_norm": 9.406937599182129, + "kl": 7.015625, + "learning_rate": 1.6734828921744127e-05, + "loss": 0.2812, + "reward": 0.4561012014746666, + "reward_std": 0.3823869004845619, + "rewards/accuracy_reward": 0.05952381156384945, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.396577388048172, + "step": 121 + }, + { + "completion_length": 1681.1488342285156, + "epoch": 0.34149755073477955, + "grad_norm": 12.184917449951172, + "kl": 8.234375, + "learning_rate": 1.6662162549178433e-05, + "loss": 0.3295, + "reward": 0.3571428582072258, + "reward_std": 0.3287462666630745, + "rewards/accuracy_reward": 0.011904762126505375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.345238097012043, + "step": 122 + }, + { + "completion_length": 1526.9613037109375, + "epoch": 0.344296710986704, + "grad_norm": 3.4144787788391113, + "kl": 3.984375, + "learning_rate": 1.658885805796111e-05, + "loss": 0.1595, + "reward": 0.4486607238650322, + "reward_std": 0.3446625769138336, + "rewards/accuracy_reward": 0.08035714458674192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.368303582072258, + "step": 123 + }, + { + "completion_length": 1342.1904907226562, + "epoch": 0.3470958712386284, + "grad_norm": 3.6479732990264893, + "kl": 1.447265625, + "learning_rate": 1.651492246938034e-05, + "loss": 0.058, + "reward": 0.5, + "reward_std": 0.37430670112371445, + "rewards/accuracy_reward": 0.11607143189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.383928582072258, + "step": 124 + }, + { + "completion_length": 1448.7232360839844, + "epoch": 0.34989503149055284, + "grad_norm": 5.434635639190674, + "kl": 1.8642578125, + "learning_rate": 1.6440362865172373e-05, + "loss": 0.0745, + "reward": 0.4642857238650322, + "reward_std": 0.37097910791635513, + "rewards/accuracy_reward": 0.08333333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.380952388048172, + "step": 125 + }, + { + "completion_length": 1421.0089416503906, + "epoch": 0.35269419174247724, + "grad_norm": 2.62148380279541, + "kl": 3.39453125, + "learning_rate": 1.636518638684325e-05, + "loss": 0.1358, + "reward": 0.4159226194024086, + "reward_std": 0.3015677332878113, + "rewards/accuracy_reward": 0.023809524485841393, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3921131044626236, + "step": 126 + }, + { + "completion_length": 1384.3363342285156, + "epoch": 0.3554933519944017, + "grad_norm": 25.959407806396484, + "kl": 9.265625, + "learning_rate": 1.628940023498477e-05, + "loss": 0.3708, + "reward": 0.4866071417927742, + "reward_std": 0.37376558035612106, + "rewards/accuracy_reward": 0.11607143096625805, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3705357238650322, + "step": 127 + }, + { + "completion_length": 1338.4583435058594, + "epoch": 0.3582925122463261, + "grad_norm": 25.4382381439209, + "kl": 10.234375, + "learning_rate": 1.621301166858479e-05, + "loss": 0.409, + "reward": 0.4806547686457634, + "reward_std": 0.3681168407201767, + "rewards/accuracy_reward": 0.08928571757860482, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3913690522313118, + "step": 128 + }, + { + "completion_length": 1237.8750305175781, + "epoch": 0.3610916724982505, + "grad_norm": 5.374622821807861, + "kl": 4.0546875, + "learning_rate": 1.613602800433194e-05, + "loss": 0.162, + "reward": 0.4538690447807312, + "reward_std": 0.34278959035873413, + "rewards/accuracy_reward": 0.09226190764456987, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3616071492433548, + "step": 129 + }, + { + "completion_length": 1252.1310119628906, + "epoch": 0.363890832750175, + "grad_norm": 2.945963144302368, + "kl": 2.73828125, + "learning_rate": 1.6058456615914815e-05, + "loss": 0.1096, + "reward": 0.471726194024086, + "reward_std": 0.3569194823503494, + "rewards/accuracy_reward": 0.10119047854095697, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3705357164144516, + "step": 130 + }, + { + "completion_length": 1219.3065490722656, + "epoch": 0.36668999300209937, + "grad_norm": 6.008285999298096, + "kl": 0.9697265625, + "learning_rate": 1.598030493331572e-05, + "loss": 0.0388, + "reward": 0.5855654776096344, + "reward_std": 0.40885039418935776, + "rewards/accuracy_reward": 0.16964286100119352, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4159226343035698, + "step": 131 + }, + { + "completion_length": 1185.7440795898438, + "epoch": 0.3694891532540238, + "grad_norm": 4.888062000274658, + "kl": 1.896484375, + "learning_rate": 1.590158044209897e-05, + "loss": 0.0758, + "reward": 0.485863097012043, + "reward_std": 0.34774496406316757, + "rewards/accuracy_reward": 0.07440476398915052, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4114583432674408, + "step": 132 + }, + { + "completion_length": 1237.4285888671875, + "epoch": 0.3722883135059482, + "grad_norm": 11.741883277893066, + "kl": 4.89453125, + "learning_rate": 1.5822290682693944e-05, + "loss": 0.1959, + "reward": 0.5238095298409462, + "reward_std": 0.39101071655750275, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4196428656578064, + "step": 133 + }, + { + "completion_length": 1146.7053833007812, + "epoch": 0.37508747375787266, + "grad_norm": 6.2459235191345215, + "kl": 3.640625, + "learning_rate": 1.574244324967283e-05, + "loss": 0.1455, + "reward": 0.6093750298023224, + "reward_std": 0.43268976360559464, + "rewards/accuracy_reward": 0.16071428824216127, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4486607238650322, + "step": 134 + }, + { + "completion_length": 1258.1607360839844, + "epoch": 0.37788663400979705, + "grad_norm": 2.578888177871704, + "kl": 1.982421875, + "learning_rate": 1.566204579102317e-05, + "loss": 0.0792, + "reward": 0.5520833358168602, + "reward_std": 0.35485880821943283, + "rewards/accuracy_reward": 0.10416666883975267, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.447916679084301, + "step": 135 + }, + { + "completion_length": 1309.71728515625, + "epoch": 0.3806857942617215, + "grad_norm": 7.092722415924072, + "kl": 4.0, + "learning_rate": 1.5581106007415382e-05, + "loss": 0.1602, + "reward": 0.4806547686457634, + "reward_std": 0.34185463935136795, + "rewards/accuracy_reward": 0.05357142933644354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4270833358168602, + "step": 136 + }, + { + "completion_length": 1118.5774230957031, + "epoch": 0.3834849545136459, + "grad_norm": 2.3951456546783447, + "kl": 1.703125, + "learning_rate": 1.5499631651465086e-05, + "loss": 0.0681, + "reward": 0.5334821566939354, + "reward_std": 0.34388598799705505, + "rewards/accuracy_reward": 0.07738095382228494, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.456101194024086, + "step": 137 + }, + { + "completion_length": 1529.0744323730469, + "epoch": 0.38628411476557034, + "grad_norm": 5.085093975067139, + "kl": 3.71484375, + "learning_rate": 1.5417630526990613e-05, + "loss": 0.1485, + "reward": 0.476190485060215, + "reward_std": 0.3736257702112198, + "rewards/accuracy_reward": 0.06845238246023655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4077381044626236, + "step": 138 + }, + { + "completion_length": 1327.3303680419922, + "epoch": 0.38908327501749473, + "grad_norm": 1.0672588348388672, + "kl": 1.56591796875, + "learning_rate": 1.5335110488265497e-05, + "loss": 0.0625, + "reward": 0.5483631119132042, + "reward_std": 0.39235763996839523, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4769345372915268, + "step": 139 + }, + { + "completion_length": 920.1905059814453, + "epoch": 0.3918824352694192, + "grad_norm": 1.9296761751174927, + "kl": 1.2568359375, + "learning_rate": 1.5252079439266179e-05, + "loss": 0.0504, + "reward": 0.6302083358168602, + "reward_std": 0.3486922085285187, + "rewards/accuracy_reward": 0.09226190857589245, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5379464402794838, + "step": 140 + }, + { + "completion_length": 1191.7410888671875, + "epoch": 0.3946815955213436, + "grad_norm": 2.825702428817749, + "kl": 2.25, + "learning_rate": 1.5168545332914942e-05, + "loss": 0.0899, + "reward": 0.554315485060215, + "reward_std": 0.3511172980070114, + "rewards/accuracy_reward": 0.05654761986806989, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4977678582072258, + "step": 141 + }, + { + "completion_length": 1211.2738342285156, + "epoch": 0.397480755773268, + "grad_norm": 2.5146985054016113, + "kl": 3.021484375, + "learning_rate": 1.5084516170318181e-05, + "loss": 0.1207, + "reward": 0.564732164144516, + "reward_std": 0.36342857778072357, + "rewards/accuracy_reward": 0.06250000209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.5022321566939354, + "step": 142 + }, + { + "completion_length": 1112.8363342285156, + "epoch": 0.4002799160251924, + "grad_norm": 2.2712340354919434, + "kl": 2.3125, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.0925, + "reward": 0.5818452537059784, + "reward_std": 0.37535255402326584, + "rewards/accuracy_reward": 0.11011905129998922, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4717262014746666, + "step": 143 + }, + { + "completion_length": 1740.1339416503906, + "epoch": 0.40307907627711687, + "grad_norm": 10.124197006225586, + "kl": 4.23046875, + "learning_rate": 1.4915004917131345e-05, + "loss": 0.1692, + "reward": 0.4330357238650322, + "reward_std": 0.3668738007545471, + "rewards/accuracy_reward": 0.06845238269306719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3645833432674408, + "step": 144 + }, + { + "completion_length": 1680.7738342285156, + "epoch": 0.4058782365290413, + "grad_norm": 3.7271039485931396, + "kl": 3.4921875, + "learning_rate": 1.4829539062754597e-05, + "loss": 0.1397, + "reward": 0.4650297686457634, + "reward_std": 0.38645100593566895, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3816964328289032, + "step": 145 + }, + { + "completion_length": 1632.6072082519531, + "epoch": 0.4086773967809657, + "grad_norm": 2.1131505966186523, + "kl": 1.166015625, + "learning_rate": 1.474361062300381e-05, + "loss": 0.0467, + "reward": 0.5156250074505806, + "reward_std": 0.3944535478949547, + "rewards/accuracy_reward": 0.08928571571595967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4263392984867096, + "step": 146 + }, + { + "completion_length": 1932.1816101074219, + "epoch": 0.41147655703289016, + "grad_norm": 1.8890049457550049, + "kl": 0.93701171875, + "learning_rate": 1.4657227828320637e-05, + "loss": 0.0374, + "reward": 0.4754464402794838, + "reward_std": 0.34944573789834976, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4129464402794838, + "step": 147 + }, + { + "completion_length": 2146.7232971191406, + "epoch": 0.41427571728481455, + "grad_norm": 2.31492280960083, + "kl": 1.1572265625, + "learning_rate": 1.4570398952665982e-05, + "loss": 0.0462, + "reward": 0.4285714402794838, + "reward_std": 0.28098214417696, + "rewards/accuracy_reward": 0.0476190485060215, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.380952388048172, + "step": 148 + }, + { + "completion_length": 2317.5238647460938, + "epoch": 0.417074877536739, + "grad_norm": 10.364558219909668, + "kl": 3.060546875, + "learning_rate": 1.4483132312727501e-05, + "loss": 0.1224, + "reward": 0.3772321492433548, + "reward_std": 0.28965678438544273, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3325892835855484, + "step": 149 + }, + { + "completion_length": 2447.544677734375, + "epoch": 0.4198740377886634, + "grad_norm": 0.6838968396186829, + "kl": 0.795654296875, + "learning_rate": 1.4395436267123017e-05, + "loss": 0.0318, + "reward": 0.3802083358168602, + "reward_std": 0.24130939319729805, + "rewards/accuracy_reward": 0.035714286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3444940596818924, + "step": 150 + }, + { + "completion_length": 2442.21728515625, + "epoch": 0.42267319804058784, + "grad_norm": 2.6140480041503906, + "kl": 1.0859375, + "learning_rate": 1.4307319215599904e-05, + "loss": 0.0434, + "reward": 0.349702388048172, + "reward_std": 0.2428320273756981, + "rewards/accuracy_reward": 0.038690477376803756, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.311011902987957, + "step": 151 + }, + { + "completion_length": 2396.1786499023438, + "epoch": 0.42547235829251223, + "grad_norm": 0.9119054079055786, + "kl": 0.658203125, + "learning_rate": 1.4218789598230536e-05, + "loss": 0.0263, + "reward": 0.3593750074505806, + "reward_std": 0.22598901391029358, + "rewards/accuracy_reward": 0.03273809631355107, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3266369104385376, + "step": 152 + }, + { + "completion_length": 2246.696502685547, + "epoch": 0.4282715185444367, + "grad_norm": 2.479092597961426, + "kl": 0.97021484375, + "learning_rate": 1.4129855894603885e-05, + "loss": 0.0387, + "reward": 0.380952388048172, + "reward_std": 0.23682785406708717, + "rewards/accuracy_reward": 0.026785715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3541666716337204, + "step": 153 + }, + { + "completion_length": 1910.4017944335938, + "epoch": 0.4310706787963611, + "grad_norm": 2.510755777359009, + "kl": 1.5029296875, + "learning_rate": 1.4040526623013317e-05, + "loss": 0.0602, + "reward": 0.482142873108387, + "reward_std": 0.33759794384241104, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3928571492433548, + "step": 154 + }, + { + "completion_length": 1917.2202453613281, + "epoch": 0.4338698390482855, + "grad_norm": 2.0638437271118164, + "kl": 0.90869140625, + "learning_rate": 1.3950810339640689e-05, + "loss": 0.0365, + "reward": 0.4441964402794838, + "reward_std": 0.30023250356316566, + "rewards/accuracy_reward": 0.06547619216144085, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3787202537059784, + "step": 155 + }, + { + "completion_length": 1813.5268249511719, + "epoch": 0.4366689993002099, + "grad_norm": 1.9543640613555908, + "kl": 1.00341796875, + "learning_rate": 1.3860715637736817e-05, + "loss": 0.0402, + "reward": 0.5000000074505806, + "reward_std": 0.33685530722141266, + "rewards/accuracy_reward": 0.08333333651535213, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4166666716337204, + "step": 156 + }, + { + "completion_length": 1584.5595703125, + "epoch": 0.43946815955213436, + "grad_norm": 1.4789046049118042, + "kl": 1.875, + "learning_rate": 1.3770251146798401e-05, + "loss": 0.075, + "reward": 0.5014881044626236, + "reward_std": 0.3617074117064476, + "rewards/accuracy_reward": 0.1101190522313118, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3913690522313118, + "step": 157 + }, + { + "completion_length": 1436.7262268066406, + "epoch": 0.44226731980405876, + "grad_norm": 1.2094820737838745, + "kl": 1.55322265625, + "learning_rate": 1.367942553174145e-05, + "loss": 0.0622, + "reward": 0.5223214402794838, + "reward_std": 0.39142677187919617, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4062500149011612, + "step": 158 + }, + { + "completion_length": 1554.3809814453125, + "epoch": 0.4450664800559832, + "grad_norm": 1.0488451719284058, + "kl": 2.0751953125, + "learning_rate": 1.358824749207136e-05, + "loss": 0.0831, + "reward": 0.4055059626698494, + "reward_std": 0.3586099296808243, + "rewards/accuracy_reward": 0.05059523927047849, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3549107164144516, + "step": 159 + }, + { + "completion_length": 1514.389892578125, + "epoch": 0.44786564030790765, + "grad_norm": 3.903090476989746, + "kl": 2.955078125, + "learning_rate": 1.3496725761049637e-05, + "loss": 0.1182, + "reward": 0.3831845298409462, + "reward_std": 0.34368982911109924, + "rewards/accuracy_reward": 0.06845238339155912, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3147321492433548, + "step": 160 + }, + { + "completion_length": 1630.8125, + "epoch": 0.45066480055983205, + "grad_norm": 1.8900141716003418, + "kl": 2.01953125, + "learning_rate": 1.3404869104857405e-05, + "loss": 0.0809, + "reward": 0.438988097012043, + "reward_std": 0.37636031210422516, + "rewards/accuracy_reward": 0.14880952890962362, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2901785746216774, + "step": 161 + }, + { + "completion_length": 1568.386962890625, + "epoch": 0.4534639608117565, + "grad_norm": 1.7460353374481201, + "kl": 1.625, + "learning_rate": 1.331268632175576e-05, + "loss": 0.0651, + "reward": 0.4308035746216774, + "reward_std": 0.41446660459041595, + "rewards/accuracy_reward": 0.12202381156384945, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3087797686457634, + "step": 162 + }, + { + "completion_length": 1669.6250305175781, + "epoch": 0.4562631210636809, + "grad_norm": 1.3991645574569702, + "kl": 2.38671875, + "learning_rate": 1.3220186241243063e-05, + "loss": 0.0955, + "reward": 0.3028273805975914, + "reward_std": 0.2935003824532032, + "rewards/accuracy_reward": 0.05059523927047849, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2522321455180645, + "step": 163 + }, + { + "completion_length": 1595.15185546875, + "epoch": 0.45906228131560534, + "grad_norm": 1.6973552703857422, + "kl": 2.68359375, + "learning_rate": 1.31273777232092e-05, + "loss": 0.1072, + "reward": 0.3534226268529892, + "reward_std": 0.36427438259124756, + "rewards/accuracy_reward": 0.08035714458674192, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.273065485060215, + "step": 164 + }, + { + "completion_length": 1709.4196472167969, + "epoch": 0.46186144156752973, + "grad_norm": 2.2822113037109375, + "kl": 2.6484375, + "learning_rate": 1.3034269657086993e-05, + "loss": 0.106, + "reward": 0.3392857164144516, + "reward_std": 0.3098442368209362, + "rewards/accuracy_reward": 0.08630952634848654, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.252976194024086, + "step": 165 + }, + { + "completion_length": 1606.3333740234375, + "epoch": 0.4646606018194542, + "grad_norm": 1.6501623392105103, + "kl": 2.5, + "learning_rate": 1.2940870961000725e-05, + "loss": 0.1, + "reward": 0.386904776096344, + "reward_std": 0.3547843061387539, + "rewards/accuracy_reward": 0.11904762033373117, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2678571492433548, + "step": 166 + }, + { + "completion_length": 1407.0684814453125, + "epoch": 0.46745976207137857, + "grad_norm": 2.6994175910949707, + "kl": 1.001953125, + "learning_rate": 1.2847190580911942e-05, + "loss": 0.0401, + "reward": 0.4084821492433548, + "reward_std": 0.3186538964509964, + "rewards/accuracy_reward": 0.0863095261156559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3221726194024086, + "step": 167 + }, + { + "completion_length": 1448.0327453613281, + "epoch": 0.470258922323303, + "grad_norm": 1.8725858926773071, + "kl": 1.51953125, + "learning_rate": 1.27532374897626e-05, + "loss": 0.0607, + "reward": 0.3898809626698494, + "reward_std": 0.3414556533098221, + "rewards/accuracy_reward": 0.11011905129998922, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.2797619104385376, + "step": 168 + }, + { + "completion_length": 1313.5803833007812, + "epoch": 0.4730580825752274, + "grad_norm": 3.556281328201294, + "kl": 2.498046875, + "learning_rate": 1.2659020686615602e-05, + "loss": 0.1, + "reward": 0.4055059626698494, + "reward_std": 0.34282371401786804, + "rewards/accuracy_reward": 0.08035714505240321, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3251488134264946, + "step": 169 + }, + { + "completion_length": 1372.2262268066406, + "epoch": 0.47585724282715186, + "grad_norm": 2.2407538890838623, + "kl": 2.48828125, + "learning_rate": 1.2564549195792842e-05, + "loss": 0.0996, + "reward": 0.4263392984867096, + "reward_std": 0.31862910091876984, + "rewards/accuracy_reward": 0.095238097012043, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.331101194024086, + "step": 170 + }, + { + "completion_length": 1369.7976379394531, + "epoch": 0.47865640307907625, + "grad_norm": 1.0283045768737793, + "kl": 1.1318359375, + "learning_rate": 1.2469832066010843e-05, + "loss": 0.0452, + "reward": 0.4203869104385376, + "reward_std": 0.29489460960030556, + "rewards/accuracy_reward": 0.05654762056656182, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3638392984867096, + "step": 171 + }, + { + "completion_length": 1435.0863342285156, + "epoch": 0.4814555633310007, + "grad_norm": 3.8681342601776123, + "kl": 2.9609375, + "learning_rate": 1.237487836951405e-05, + "loss": 0.1186, + "reward": 0.4680059626698494, + "reward_std": 0.35737139731645584, + "rewards/accuracy_reward": 0.13988095335662365, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3281250074505806, + "step": 172 + }, + { + "completion_length": 1576.9256591796875, + "epoch": 0.4842547235829251, + "grad_norm": 2.6435940265655518, + "kl": 2.427734375, + "learning_rate": 1.2279697201205852e-05, + "loss": 0.0972, + "reward": 0.4241071566939354, + "reward_std": 0.3397139459848404, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3080357238650322, + "step": 173 + }, + { + "completion_length": 1427.71728515625, + "epoch": 0.48705388383484954, + "grad_norm": 1.935517430305481, + "kl": 1.826171875, + "learning_rate": 1.2184297677777463e-05, + "loss": 0.0731, + "reward": 0.4263392984867096, + "reward_std": 0.32414279878139496, + "rewards/accuracy_reward": 0.059523810632526875, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3668154776096344, + "step": 174 + }, + { + "completion_length": 1593.8095703125, + "epoch": 0.489853044086774, + "grad_norm": 1.108588457107544, + "kl": 2.0859375, + "learning_rate": 1.2088688936834705e-05, + "loss": 0.0834, + "reward": 0.415178582072258, + "reward_std": 0.35114120692014694, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3258928656578064, + "step": 175 + }, + { + "completion_length": 1695.0208740234375, + "epoch": 0.4926522043386984, + "grad_norm": 3.346130847930908, + "kl": 1.9765625, + "learning_rate": 1.1992880136022766e-05, + "loss": 0.079, + "reward": 0.4389881044626236, + "reward_std": 0.35956408083438873, + "rewards/accuracy_reward": 0.0744047649204731, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3645833358168602, + "step": 176 + }, + { + "completion_length": 1525.5892944335938, + "epoch": 0.49545136459062283, + "grad_norm": 1.3358293771743774, + "kl": 2.763671875, + "learning_rate": 1.1896880452149077e-05, + "loss": 0.1106, + "reward": 0.4553571417927742, + "reward_std": 0.36714357137680054, + "rewards/accuracy_reward": 0.08630952658131719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3690476268529892, + "step": 177 + }, + { + "completion_length": 1555.9553833007812, + "epoch": 0.4982505248425472, + "grad_norm": 4.61871337890625, + "kl": 3.32421875, + "learning_rate": 1.1800699080304333e-05, + "loss": 0.1332, + "reward": 0.4345238208770752, + "reward_std": 0.3388953059911728, + "rewards/accuracy_reward": 0.08630952518433332, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3482142984867096, + "step": 178 + }, + { + "completion_length": 1651.6310119628906, + "epoch": 0.5010496850944717, + "grad_norm": 3.9217450618743896, + "kl": 3.23828125, + "learning_rate": 1.170434523298175e-05, + "loss": 0.1295, + "reward": 0.480654776096344, + "reward_std": 0.3782733231782913, + "rewards/accuracy_reward": 0.11309524206444621, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3675595298409462, + "step": 179 + }, + { + "completion_length": 1618.5744323730469, + "epoch": 0.5038488453463961, + "grad_norm": 1.288669466972351, + "kl": 2.3359375, + "learning_rate": 1.1607828139194683e-05, + "loss": 0.0936, + "reward": 0.4441964328289032, + "reward_std": 0.3210986442863941, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3906250074505806, + "step": 180 + }, + { + "completion_length": 1708.3572082519531, + "epoch": 0.5066480055983205, + "grad_norm": 3.251737356185913, + "kl": 1.921875, + "learning_rate": 1.1511157043592642e-05, + "loss": 0.077, + "reward": 0.4665178656578064, + "reward_std": 0.34893832355737686, + "rewards/accuracy_reward": 0.08928571594879031, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3772321566939354, + "step": 181 + }, + { + "completion_length": 1689.7410888671875, + "epoch": 0.509447165850245, + "grad_norm": 3.8254008293151855, + "kl": 2.5390625, + "learning_rate": 1.1414341205575817e-05, + "loss": 0.1017, + "reward": 0.4613095223903656, + "reward_std": 0.36897626519203186, + "rewards/accuracy_reward": 0.09523809794336557, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3660714328289032, + "step": 182 + }, + { + "completion_length": 1813.2619323730469, + "epoch": 0.5122463261021694, + "grad_norm": 1.4044800996780396, + "kl": 2.666015625, + "learning_rate": 1.1317389898408188e-05, + "loss": 0.1067, + "reward": 0.5007440596818924, + "reward_std": 0.4136379510164261, + "rewards/accuracy_reward": 0.13988095335662365, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.360863097012043, + "step": 183 + }, + { + "completion_length": 1728.4286193847656, + "epoch": 0.5150454863540938, + "grad_norm": 1.253510594367981, + "kl": 2.189453125, + "learning_rate": 1.122031240832932e-05, + "loss": 0.0875, + "reward": 0.4947916716337204, + "reward_std": 0.3844044655561447, + "rewards/accuracy_reward": 0.1130952425301075, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3816964402794838, + "step": 184 + }, + { + "completion_length": 1927.1012268066406, + "epoch": 0.5178446466060181, + "grad_norm": 2.080470323562622, + "kl": 2.39453125, + "learning_rate": 1.1123118033664877e-05, + "loss": 0.0956, + "reward": 0.4553571492433548, + "reward_std": 0.38487474620342255, + "rewards/accuracy_reward": 0.11309524113312364, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3422619104385376, + "step": 185 + }, + { + "completion_length": 1885.8452453613281, + "epoch": 0.5206438068579426, + "grad_norm": 2.732903480529785, + "kl": 2.13671875, + "learning_rate": 1.1025816083936036e-05, + "loss": 0.0857, + "reward": 0.4241071492433548, + "reward_std": 0.35953450947999954, + "rewards/accuracy_reward": 0.07440476305782795, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.349702388048172, + "step": 186 + }, + { + "completion_length": 1660.8958740234375, + "epoch": 0.523442967109867, + "grad_norm": 1.6196237802505493, + "kl": 1.767578125, + "learning_rate": 1.0928415878967781e-05, + "loss": 0.0708, + "reward": 0.4672619104385376, + "reward_std": 0.35377056896686554, + "rewards/accuracy_reward": 0.09226190764456987, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3750000074505806, + "step": 187 + }, + { + "completion_length": 1907.0000610351562, + "epoch": 0.5262421273617914, + "grad_norm": 1.6180599927902222, + "kl": 2.533203125, + "learning_rate": 1.0830926747996225e-05, + "loss": 0.1014, + "reward": 0.4813988134264946, + "reward_std": 0.38008756190538406, + "rewards/accuracy_reward": 0.095238097012043, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3861607238650322, + "step": 188 + }, + { + "completion_length": 2157.1190490722656, + "epoch": 0.5290412876137159, + "grad_norm": 5.352792739868164, + "kl": 3.13671875, + "learning_rate": 1.073335802877504e-05, + "loss": 0.1255, + "reward": 0.4813988134264946, + "reward_std": 0.3793950453400612, + "rewards/accuracy_reward": 0.10714286006987095, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3742559552192688, + "step": 189 + }, + { + "completion_length": 2155.3453063964844, + "epoch": 0.5318404478656403, + "grad_norm": 1.6584246158599854, + "kl": 2.57421875, + "learning_rate": 1.0635719066681064e-05, + "loss": 0.103, + "reward": 0.412202388048172, + "reward_std": 0.3547248989343643, + "rewards/accuracy_reward": 0.06547619146294892, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.346726194024086, + "step": 190 + }, + { + "completion_length": 2337.6221313476562, + "epoch": 0.5346396081175647, + "grad_norm": 1.1737061738967896, + "kl": 2.140625, + "learning_rate": 1.053801921381916e-05, + "loss": 0.0859, + "reward": 0.3578869104385376, + "reward_std": 0.29282068461179733, + "rewards/accuracy_reward": 0.029761905781924725, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.328125, + "step": 191 + }, + { + "completion_length": 1711.8750305175781, + "epoch": 0.5374387683694891, + "grad_norm": 3.013000726699829, + "kl": 1.392578125, + "learning_rate": 1.0440267828126478e-05, + "loss": 0.0556, + "reward": 0.4367559626698494, + "reward_std": 0.34077152609825134, + "rewards/accuracy_reward": 0.06547619262710214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3712797686457634, + "step": 192 + }, + { + "completion_length": 1668.4494323730469, + "epoch": 0.5402379286214136, + "grad_norm": 4.953486919403076, + "kl": 1.1025390625, + "learning_rate": 1.0342474272476108e-05, + "loss": 0.0442, + "reward": 0.4620535895228386, + "reward_std": 0.3246808350086212, + "rewards/accuracy_reward": 0.07142857322469354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.390625, + "step": 193 + }, + { + "completion_length": 1580.8155212402344, + "epoch": 0.543037088873338, + "grad_norm": 3.358100175857544, + "kl": 1.0615234375, + "learning_rate": 1.0244647913780272e-05, + "loss": 0.0425, + "reward": 0.4985119178891182, + "reward_std": 0.3674391433596611, + "rewards/accuracy_reward": 0.09226190857589245, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4062500074505806, + "step": 194 + }, + { + "completion_length": 1738.3452453613281, + "epoch": 0.5458362491252624, + "grad_norm": 3.052586555480957, + "kl": 1.978515625, + "learning_rate": 1.0146798122093167e-05, + "loss": 0.079, + "reward": 0.413690485060215, + "reward_std": 0.2985365241765976, + "rewards/accuracy_reward": 0.05059524020180106, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3630952462553978, + "step": 195 + }, + { + "completion_length": 2152.5, + "epoch": 0.5486354093771868, + "grad_norm": 6.195072650909424, + "kl": 3.5859375, + "learning_rate": 1.004893426971345e-05, + "loss": 0.1437, + "reward": 0.3742559552192688, + "reward_std": 0.3401479683816433, + "rewards/accuracy_reward": 0.059523810632526875, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3147321492433548, + "step": 196 + }, + { + "completion_length": 1917.0119323730469, + "epoch": 0.5514345696291113, + "grad_norm": 7.311130046844482, + "kl": 3.46484375, + "learning_rate": 9.951065730286553e-06, + "loss": 0.1387, + "reward": 0.3549107238650322, + "reward_std": 0.32089220359921455, + "rewards/accuracy_reward": 0.03869047714397311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3162202462553978, + "step": 197 + }, + { + "completion_length": 2022.8363647460938, + "epoch": 0.5542337298810357, + "grad_norm": 9.176398277282715, + "kl": 5.3984375, + "learning_rate": 9.853201877906836e-06, + "loss": 0.2158, + "reward": 0.3980654925107956, + "reward_std": 0.3774467706680298, + "rewards/accuracy_reward": 0.10119047947227955, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.296875, + "step": 198 + }, + { + "completion_length": 1690.386962890625, + "epoch": 0.5570328901329601, + "grad_norm": 2.576979160308838, + "kl": 2.515625, + "learning_rate": 9.755352086219733e-06, + "loss": 0.1009, + "reward": 0.4017857238650322, + "reward_std": 0.3037550374865532, + "rewards/accuracy_reward": 0.05654761986806989, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3452381044626236, + "step": 199 + }, + { + "completion_length": 1661.7619323730469, + "epoch": 0.5598320503848845, + "grad_norm": 2.2454562187194824, + "kl": 2.4375, + "learning_rate": 9.657525727523897e-06, + "loss": 0.0977, + "reward": 0.4285714402794838, + "reward_std": 0.35358382761478424, + "rewards/accuracy_reward": 0.08333333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3452381044626236, + "step": 200 + }, + { + "completion_length": 1493.264892578125, + "epoch": 0.562631210636809, + "grad_norm": 2.4267282485961914, + "kl": 1.494140625, + "learning_rate": 9.559732171873524e-06, + "loss": 0.0596, + "reward": 0.4538690596818924, + "reward_std": 0.345071017742157, + "rewards/accuracy_reward": 0.08333333465270698, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3705357238650322, + "step": 201 + }, + { + "completion_length": 1501.139892578125, + "epoch": 0.5654303708887334, + "grad_norm": 1.8665298223495483, + "kl": 1.751953125, + "learning_rate": 9.461980786180844e-06, + "loss": 0.0701, + "reward": 0.4471726343035698, + "reward_std": 0.3375595882534981, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.363839291036129, + "step": 202 + }, + { + "completion_length": 1641.7619323730469, + "epoch": 0.5682295311406578, + "grad_norm": 1.5801560878753662, + "kl": 2.5859375, + "learning_rate": 9.364280933318943e-06, + "loss": 0.1033, + "reward": 0.4069940522313118, + "reward_std": 0.31903109699487686, + "rewards/accuracy_reward": 0.06845238339155912, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3385416641831398, + "step": 203 + }, + { + "completion_length": 1789.0565490722656, + "epoch": 0.5710286913925823, + "grad_norm": 5.1003594398498535, + "kl": 4.50390625, + "learning_rate": 9.266641971224963e-06, + "loss": 0.1803, + "reward": 0.4136904776096344, + "reward_std": 0.33030133321881294, + "rewards/accuracy_reward": 0.08333333604969084, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3303571492433548, + "step": 204 + }, + { + "completion_length": 1651.4851379394531, + "epoch": 0.5738278516445067, + "grad_norm": 6.3953142166137695, + "kl": 4.76171875, + "learning_rate": 9.16907325200378e-06, + "loss": 0.1904, + "reward": 0.4315476194024086, + "reward_std": 0.36674462258815765, + "rewards/accuracy_reward": 0.095238097012043, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3363095223903656, + "step": 205 + }, + { + "completion_length": 1566.9464721679688, + "epoch": 0.5766270118964311, + "grad_norm": 3.042302131652832, + "kl": 4.4921875, + "learning_rate": 9.071584121032224e-06, + "loss": 0.1795, + "reward": 0.3854166716337204, + "reward_std": 0.33248353749513626, + "rewards/accuracy_reward": 0.05952381016686559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3258928582072258, + "step": 206 + }, + { + "completion_length": 1436.8363037109375, + "epoch": 0.5794261721483555, + "grad_norm": 6.986565113067627, + "kl": 3.0703125, + "learning_rate": 8.974183916063967e-06, + "loss": 0.1227, + "reward": 0.3988095372915268, + "reward_std": 0.3306507095694542, + "rewards/accuracy_reward": 0.05654762056656182, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3422619104385376, + "step": 207 + }, + { + "completion_length": 1366.5744323730469, + "epoch": 0.58222533240028, + "grad_norm": 8.636092185974121, + "kl": 2.53515625, + "learning_rate": 8.876881966335128e-06, + "loss": 0.1014, + "reward": 0.4479166716337204, + "reward_std": 0.3630291298031807, + "rewards/accuracy_reward": 0.08928571734577417, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3586309552192688, + "step": 208 + }, + { + "completion_length": 1468.5625, + "epoch": 0.5850244926522044, + "grad_norm": 8.378006935119629, + "kl": 2.90625, + "learning_rate": 8.779687591670687e-06, + "loss": 0.1164, + "reward": 0.4151785746216774, + "reward_std": 0.35592009127140045, + "rewards/accuracy_reward": 0.07440476445481181, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3407738134264946, + "step": 209 + }, + { + "completion_length": 1453.7380981445312, + "epoch": 0.5878236529041287, + "grad_norm": 4.64111328125, + "kl": 5.40625, + "learning_rate": 8.682610101591813e-06, + "loss": 0.2163, + "reward": 0.3898809626698494, + "reward_std": 0.3686796501278877, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3184523805975914, + "step": 210 + }, + { + "completion_length": 1667.4732360839844, + "epoch": 0.5906228131560531, + "grad_norm": 9.325288772583008, + "kl": 7.6015625, + "learning_rate": 8.585658794424188e-06, + "loss": 0.3045, + "reward": 0.4419642835855484, + "reward_std": 0.35975363850593567, + "rewards/accuracy_reward": 0.0863095261156559, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.355654776096344, + "step": 211 + }, + { + "completion_length": 1612.9613342285156, + "epoch": 0.5934219734079776, + "grad_norm": 25.526140213012695, + "kl": 11.859375, + "learning_rate": 8.488842956407361e-06, + "loss": 0.4744, + "reward": 0.396577388048172, + "reward_std": 0.3830376863479614, + "rewards/accuracy_reward": 0.08035714412108064, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3162202462553978, + "step": 212 + }, + { + "completion_length": 1545.3958740234375, + "epoch": 0.596221133659902, + "grad_norm": 12.208528518676758, + "kl": 8.90625, + "learning_rate": 8.39217186080532e-06, + "loss": 0.3569, + "reward": 0.4479166865348816, + "reward_std": 0.36742912232875824, + "rewards/accuracy_reward": 0.08630952658131719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3616071492433548, + "step": 213 + }, + { + "completion_length": 1553.7649230957031, + "epoch": 0.5990202939118264, + "grad_norm": 4.715460300445557, + "kl": 7.6171875, + "learning_rate": 8.295654767018254e-06, + "loss": 0.3048, + "reward": 0.4069940522313118, + "reward_std": 0.3455456346273422, + "rewards/accuracy_reward": 0.04761904804036021, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.359375, + "step": 214 + }, + { + "completion_length": 1549.9524230957031, + "epoch": 0.6018194541637508, + "grad_norm": 6.758280277252197, + "kl": 4.3515625, + "learning_rate": 8.19930091969567e-06, + "loss": 0.1742, + "reward": 0.4174107238650322, + "reward_std": 0.3410634845495224, + "rewards/accuracy_reward": 0.07738095475360751, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3400297611951828, + "step": 215 + }, + { + "completion_length": 1543.9256286621094, + "epoch": 0.6046186144156753, + "grad_norm": 9.901355743408203, + "kl": 3.36328125, + "learning_rate": 8.103119547850924e-06, + "loss": 0.1345, + "reward": 0.4479166716337204, + "reward_std": 0.3812849670648575, + "rewards/accuracy_reward": 0.06250000232830644, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3854166716337204, + "step": 216 + }, + { + "completion_length": 1461.9345397949219, + "epoch": 0.6074177746675997, + "grad_norm": 11.066727638244629, + "kl": 2.88671875, + "learning_rate": 8.00711986397724e-06, + "loss": 0.1155, + "reward": 0.4709821492433548, + "reward_std": 0.3499302640557289, + "rewards/accuracy_reward": 0.07142857345752418, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3995535746216774, + "step": 217 + }, + { + "completion_length": 1540.044677734375, + "epoch": 0.6102169349195241, + "grad_norm": 8.582736015319824, + "kl": 3.6953125, + "learning_rate": 7.911311063165298e-06, + "loss": 0.148, + "reward": 0.4330357238650322, + "reward_std": 0.3405921831727028, + "rewards/accuracy_reward": 0.04464285867288709, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3883928656578064, + "step": 218 + }, + { + "completion_length": 1344.5506286621094, + "epoch": 0.6130160951714486, + "grad_norm": 2.399972915649414, + "kl": 5.453125, + "learning_rate": 7.815702322222539e-06, + "loss": 0.2185, + "reward": 0.4583333432674408, + "reward_std": 0.3629949390888214, + "rewards/accuracy_reward": 0.05654762079939246, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4017857238650322, + "step": 219 + }, + { + "completion_length": 1495.5089416503906, + "epoch": 0.615815255423373, + "grad_norm": 20.742280960083008, + "kl": 10.546875, + "learning_rate": 7.720302798794153e-06, + "loss": 0.4212, + "reward": 0.476190485060215, + "reward_std": 0.4082324057817459, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3720238134264946, + "step": 220 + }, + { + "completion_length": 1365.5804138183594, + "epoch": 0.6186144156752974, + "grad_norm": 33.20966720581055, + "kl": 13.78125, + "learning_rate": 7.6251216304859555e-06, + "loss": 0.5513, + "reward": 0.4389881044626236, + "reward_std": 0.3705972507596016, + "rewards/accuracy_reward": 0.07142857415601611, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3675595298409462, + "step": 221 + }, + { + "completion_length": 1485.7589721679688, + "epoch": 0.6214135759272218, + "grad_norm": 40.53151321411133, + "kl": 15.125, + "learning_rate": 7.530167933989161e-06, + "loss": 0.6056, + "reward": 0.3928571492433548, + "reward_std": 0.34987135231494904, + "rewards/accuracy_reward": 0.0565476194024086, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3363095298409462, + "step": 222 + }, + { + "completion_length": 1317.047607421875, + "epoch": 0.6242127361791463, + "grad_norm": 22.07911491394043, + "kl": 11.40625, + "learning_rate": 7.435450804207165e-06, + "loss": 0.4567, + "reward": 0.4315476268529892, + "reward_std": 0.344863198697567, + "rewards/accuracy_reward": 0.05357142956927419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.377976194024086, + "step": 223 + }, + { + "completion_length": 1266.4821472167969, + "epoch": 0.6270118964310707, + "grad_norm": 8.838082313537598, + "kl": 8.625, + "learning_rate": 7.340979313384404e-06, + "loss": 0.3443, + "reward": 0.413690485060215, + "reward_std": 0.34003881365060806, + "rewards/accuracy_reward": 0.03273809631355107, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3809523805975914, + "step": 224 + }, + { + "completion_length": 1293.1190795898438, + "epoch": 0.6298110566829951, + "grad_norm": 7.68155574798584, + "kl": 4.62109375, + "learning_rate": 7.246762510237404e-06, + "loss": 0.1848, + "reward": 0.4233631044626236, + "reward_std": 0.3254314064979553, + "rewards/accuracy_reward": 0.0505952388048172, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3727678656578064, + "step": 225 + }, + { + "completion_length": 1415.7381286621094, + "epoch": 0.6326102169349195, + "grad_norm": 10.290997505187988, + "kl": 4.11328125, + "learning_rate": 7.1528094190880625e-06, + "loss": 0.1645, + "reward": 0.3787202462553978, + "reward_std": 0.3096393644809723, + "rewards/accuracy_reward": 0.017857143189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3608631044626236, + "step": 226 + }, + { + "completion_length": 1318.9434509277344, + "epoch": 0.635409377186844, + "grad_norm": 11.238645553588867, + "kl": 3.35546875, + "learning_rate": 7.059129038999282e-06, + "loss": 0.1342, + "reward": 0.4732143059372902, + "reward_std": 0.3673306256532669, + "rewards/accuracy_reward": 0.06845238246023655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4047619178891182, + "step": 227 + }, + { + "completion_length": 1405.3839111328125, + "epoch": 0.6382085374387684, + "grad_norm": 10.787981986999512, + "kl": 2.9609375, + "learning_rate": 6.965730342913011e-06, + "loss": 0.1184, + "reward": 0.4419642984867096, + "reward_std": 0.3347252234816551, + "rewards/accuracy_reward": 0.0505952388048172, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3913690447807312, + "step": 228 + }, + { + "completion_length": 1314.3452453613281, + "epoch": 0.6410076976906928, + "grad_norm": 7.792874336242676, + "kl": 4.4140625, + "learning_rate": 6.872622276790804e-06, + "loss": 0.1765, + "reward": 0.451636902987957, + "reward_std": 0.35847172886133194, + "rewards/accuracy_reward": 0.06547619285993278, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3861607238650322, + "step": 229 + }, + { + "completion_length": 1399.3661193847656, + "epoch": 0.6438068579426172, + "grad_norm": 3.925266742706299, + "kl": 7.453125, + "learning_rate": 6.779813758756943e-06, + "loss": 0.2983, + "reward": 0.3638392984867096, + "reward_std": 0.2915553152561188, + "rewards/accuracy_reward": 0.014880952658131719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3489583432674408, + "step": 230 + }, + { + "completion_length": 1295.4732360839844, + "epoch": 0.6466060181945417, + "grad_norm": 3.424492120742798, + "kl": 7.1484375, + "learning_rate": 6.687313678244243e-06, + "loss": 0.2856, + "reward": 0.3928571492433548, + "reward_std": 0.3187793642282486, + "rewards/accuracy_reward": 0.023809524718672037, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3690476194024086, + "step": 231 + }, + { + "completion_length": 1281.0714416503906, + "epoch": 0.6494051784464661, + "grad_norm": 2.5056002140045166, + "kl": 6.6953125, + "learning_rate": 6.595130895142601e-06, + "loss": 0.2675, + "reward": 0.4285714402794838, + "reward_std": 0.35016684234142303, + "rewards/accuracy_reward": 0.059523810632526875, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3690476343035698, + "step": 232 + }, + { + "completion_length": 1502.2589721679688, + "epoch": 0.6522043386983905, + "grad_norm": 1.534533977508545, + "kl": 5.55859375, + "learning_rate": 6.5032742389503676e-06, + "loss": 0.2221, + "reward": 0.3772321492433548, + "reward_std": 0.2965746596455574, + "rewards/accuracy_reward": 0.02678571525029838, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3504464328289032, + "step": 233 + }, + { + "completion_length": 1355.3065795898438, + "epoch": 0.655003498950315, + "grad_norm": 6.90831184387207, + "kl": 4.24609375, + "learning_rate": 6.411752507928643e-06, + "loss": 0.1697, + "reward": 0.491815485060215, + "reward_std": 0.3878900036215782, + "rewards/accuracy_reward": 0.08630952634848654, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4055059552192688, + "step": 234 + }, + { + "completion_length": 1345.2649230957031, + "epoch": 0.6578026592022393, + "grad_norm": 3.9950220584869385, + "kl": 4.9296875, + "learning_rate": 6.3205744682585545e-06, + "loss": 0.1972, + "reward": 0.4397321492433548, + "reward_std": 0.35445038229227066, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3772321492433548, + "step": 235 + }, + { + "completion_length": 1201.6904907226562, + "epoch": 0.6606018194541637, + "grad_norm": 4.134045124053955, + "kl": 4.6796875, + "learning_rate": 6.229748853201605e-06, + "loss": 0.1875, + "reward": 0.4873512014746666, + "reward_std": 0.3461935296654701, + "rewards/accuracy_reward": 0.04761904897168279, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4397321492433548, + "step": 236 + }, + { + "completion_length": 1371.9345397949219, + "epoch": 0.6634009797060881, + "grad_norm": 11.206262588500977, + "kl": 8.2734375, + "learning_rate": 6.139284362263185e-06, + "loss": 0.3307, + "reward": 0.4516369104385376, + "reward_std": 0.3695817217230797, + "rewards/accuracy_reward": 0.07142857392318547, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3802083432674408, + "step": 237 + }, + { + "completion_length": 1237.1904907226562, + "epoch": 0.6662001399580126, + "grad_norm": 10.295726776123047, + "kl": 8.546875, + "learning_rate": 6.049189660359316e-06, + "loss": 0.342, + "reward": 0.4367559626698494, + "reward_std": 0.32462088763713837, + "rewards/accuracy_reward": 0.035714286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.401041679084301, + "step": 238 + }, + { + "completion_length": 1261.9673156738281, + "epoch": 0.668999300209937, + "grad_norm": 12.13981819152832, + "kl": 8.53125, + "learning_rate": 5.959473376986686e-06, + "loss": 0.3417, + "reward": 0.3921131044626236, + "reward_std": 0.29936332255601883, + "rewards/accuracy_reward": 0.032738096080720425, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.359375, + "step": 239 + }, + { + "completion_length": 1295.8750305175781, + "epoch": 0.6717984604618614, + "grad_norm": 4.599477767944336, + "kl": 7.453125, + "learning_rate": 5.8701441053961185e-06, + "loss": 0.2985, + "reward": 0.4077381044626236, + "reward_std": 0.30625439435243607, + "rewards/accuracy_reward": 0.026785714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3809523954987526, + "step": 240 + }, + { + "completion_length": 1243.9226379394531, + "epoch": 0.6745976207137858, + "grad_norm": 5.511775970458984, + "kl": 5.359375, + "learning_rate": 5.781210401769466e-06, + "loss": 0.2147, + "reward": 0.3891369104385376, + "reward_std": 0.30660995095968246, + "rewards/accuracy_reward": 0.017857143189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3712797686457634, + "step": 241 + }, + { + "completion_length": 1342.9434814453125, + "epoch": 0.6773967809657103, + "grad_norm": 8.561990737915039, + "kl": 4.61328125, + "learning_rate": 5.692680784400102e-06, + "loss": 0.1848, + "reward": 0.4322916865348816, + "reward_std": 0.3353210613131523, + "rewards/accuracy_reward": 0.05357143026776612, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3787202462553978, + "step": 242 + }, + { + "completion_length": 1356.8809814453125, + "epoch": 0.6801959412176347, + "grad_norm": 4.397611141204834, + "kl": 5.734375, + "learning_rate": 5.604563732876989e-06, + "loss": 0.2292, + "reward": 0.3720238208770752, + "reward_std": 0.30530185252428055, + "rewards/accuracy_reward": 0.014880952425301075, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3571428656578064, + "step": 243 + }, + { + "completion_length": 1383.9255981445312, + "epoch": 0.6829951014695591, + "grad_norm": 1.6770597696304321, + "kl": 6.1953125, + "learning_rate": 5.516867687272504e-06, + "loss": 0.2481, + "reward": 0.3898809552192688, + "reward_std": 0.3356594890356064, + "rewards/accuracy_reward": 0.04464285774156451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3452381044626236, + "step": 244 + }, + { + "completion_length": 1272.5833435058594, + "epoch": 0.6857942617214835, + "grad_norm": 2.74718976020813, + "kl": 5.53125, + "learning_rate": 5.429601047334022e-06, + "loss": 0.2214, + "reward": 0.401041679084301, + "reward_std": 0.31402096152305603, + "rewards/accuracy_reward": 0.04166666814126074, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.359375, + "step": 245 + }, + { + "completion_length": 1301.0327758789062, + "epoch": 0.688593421973408, + "grad_norm": 2.0039477348327637, + "kl": 5.4140625, + "learning_rate": 5.342772171679364e-06, + "loss": 0.2168, + "reward": 0.4002976343035698, + "reward_std": 0.3251708596944809, + "rewards/accuracy_reward": 0.035714286379516125, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3645833432674408, + "step": 246 + }, + { + "completion_length": 1399.3333740234375, + "epoch": 0.6913925822253324, + "grad_norm": 1.3752001523971558, + "kl": 5.7578125, + "learning_rate": 5.256389376996192e-06, + "loss": 0.2303, + "reward": 0.3906250074505806, + "reward_std": 0.29199953749775887, + "rewards/accuracy_reward": 0.02380952425301075, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.366815485060215, + "step": 247 + }, + { + "completion_length": 1267.5119018554688, + "epoch": 0.6941917424772568, + "grad_norm": 1.7263340950012207, + "kl": 5.84375, + "learning_rate": 5.17046093724541e-06, + "loss": 0.2338, + "reward": 0.4084821492433548, + "reward_std": 0.3570159748196602, + "rewards/accuracy_reward": 0.06250000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3459821492433548, + "step": 248 + }, + { + "completion_length": 1337.2440795898438, + "epoch": 0.6969909027291813, + "grad_norm": 3.745554208755493, + "kl": 6.515625, + "learning_rate": 5.084995082868658e-06, + "loss": 0.2608, + "reward": 0.3973214328289032, + "reward_std": 0.3254318907856941, + "rewards/accuracy_reward": 0.041666667675599456, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3556547686457634, + "step": 249 + }, + { + "completion_length": 1396.2857360839844, + "epoch": 0.6997900629811057, + "grad_norm": 6.773102760314941, + "kl": 4.6796875, + "learning_rate": 5.000000000000003e-06, + "loss": 0.1874, + "reward": 0.4040178656578064, + "reward_std": 0.32433080673217773, + "rewards/accuracy_reward": 0.05357142956927419, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3504464402794838, + "step": 250 + }, + { + "completion_length": 1296.1488647460938, + "epoch": 0.7025892232330301, + "grad_norm": 4.725771427154541, + "kl": 4.8515625, + "learning_rate": 4.9154838296818246e-06, + "loss": 0.194, + "reward": 0.399553582072258, + "reward_std": 0.33593638241291046, + "rewards/accuracy_reward": 0.05654761986806989, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3430059552192688, + "step": 251 + }, + { + "completion_length": 1240.3958740234375, + "epoch": 0.7053883834849545, + "grad_norm": 4.367389678955078, + "kl": 5.171875, + "learning_rate": 4.831454667085059e-06, + "loss": 0.2071, + "reward": 0.4561012014746666, + "reward_std": 0.36951150745153427, + "rewards/accuracy_reward": 0.0773809552192688, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3787202462553978, + "step": 252 + }, + { + "completion_length": 1372.3809814453125, + "epoch": 0.708187543736879, + "grad_norm": 6.697968482971191, + "kl": 7.4375, + "learning_rate": 4.747920560733825e-06, + "loss": 0.2972, + "reward": 0.4397321492433548, + "reward_std": 0.3764253333210945, + "rewards/accuracy_reward": 0.07738095475360751, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3623512014746666, + "step": 253 + }, + { + "completion_length": 1380.2976684570312, + "epoch": 0.7109867039888034, + "grad_norm": 7.315332889556885, + "kl": 7.265625, + "learning_rate": 4.664889511734509e-06, + "loss": 0.2909, + "reward": 0.4188988134264946, + "reward_std": 0.350118488073349, + "rewards/accuracy_reward": 0.056547620333731174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3623512014746666, + "step": 254 + }, + { + "completion_length": 1301.0565795898438, + "epoch": 0.7137858642407278, + "grad_norm": 5.572265625, + "kl": 6.640625, + "learning_rate": 4.58236947300939e-06, + "loss": 0.2657, + "reward": 0.4248512089252472, + "reward_std": 0.3546188771724701, + "rewards/accuracy_reward": 0.07142857182770967, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3534226268529892, + "step": 255 + }, + { + "completion_length": 1290.7500305175781, + "epoch": 0.7165850244926522, + "grad_norm": 2.190363645553589, + "kl": 5.61328125, + "learning_rate": 4.500368348534918e-06, + "loss": 0.2247, + "reward": 0.4233631044626236, + "reward_std": 0.3518420085310936, + "rewards/accuracy_reward": 0.07738095475360751, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3459821417927742, + "step": 256 + }, + { + "completion_length": 1240.169677734375, + "epoch": 0.7193841847445767, + "grad_norm": 5.1395158767700195, + "kl": 4.6875, + "learning_rate": 4.418893992584624e-06, + "loss": 0.1876, + "reward": 0.4166666716337204, + "reward_std": 0.36202676594257355, + "rewards/accuracy_reward": 0.05952381086535752, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3571428656578064, + "step": 257 + }, + { + "completion_length": 1318.6726684570312, + "epoch": 0.722183344996501, + "grad_norm": 8.188876152038574, + "kl": 4.14453125, + "learning_rate": 4.33795420897683e-06, + "loss": 0.1659, + "reward": 0.3995535746216774, + "reward_std": 0.32532942295074463, + "rewards/accuracy_reward": 0.03571428661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.363839291036129, + "step": 258 + }, + { + "completion_length": 1328.4286193847656, + "epoch": 0.7249825052484254, + "grad_norm": 4.540230751037598, + "kl": 4.91015625, + "learning_rate": 4.257556750327176e-06, + "loss": 0.1964, + "reward": 0.3861607164144516, + "reward_std": 0.3183294087648392, + "rewards/accuracy_reward": 0.026785715017467737, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3593750074505806, + "step": 259 + }, + { + "completion_length": 1330.0566101074219, + "epoch": 0.72778166550035, + "grad_norm": 5.644837856292725, + "kl": 4.78515625, + "learning_rate": 4.17770931730606e-06, + "loss": 0.1913, + "reward": 0.3958333507180214, + "reward_std": 0.30248239636421204, + "rewards/accuracy_reward": 0.029761905781924725, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3660714328289032, + "step": 260 + }, + { + "completion_length": 1423.4256286621094, + "epoch": 0.7305808257522743, + "grad_norm": 3.3239707946777344, + "kl": 6.5859375, + "learning_rate": 4.098419557901036e-06, + "loss": 0.2634, + "reward": 0.4002976194024086, + "reward_std": 0.3251989297568798, + "rewards/accuracy_reward": 0.04166666744276881, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3586309552192688, + "step": 261 + }, + { + "completion_length": 1285.2202758789062, + "epoch": 0.7333799860041987, + "grad_norm": 12.05953598022461, + "kl": 7.9765625, + "learning_rate": 4.019695066684285e-06, + "loss": 0.3188, + "reward": 0.3921131044626236, + "reward_std": 0.34052395820617676, + "rewards/accuracy_reward": 0.05357142980210483, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3385416716337204, + "step": 262 + }, + { + "completion_length": 1377.1815795898438, + "epoch": 0.7361791462561231, + "grad_norm": 7.603522777557373, + "kl": 6.7109375, + "learning_rate": 3.9415433840851845e-06, + "loss": 0.2687, + "reward": 0.398065485060215, + "reward_std": 0.3070693612098694, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3534226268529892, + "step": 263 + }, + { + "completion_length": 1344.1726684570312, + "epoch": 0.7389783065080476, + "grad_norm": 1.4179707765579224, + "kl": 5.3203125, + "learning_rate": 3.8639719956680624e-06, + "loss": 0.2133, + "reward": 0.4188988283276558, + "reward_std": 0.32210823148489, + "rewards/accuracy_reward": 0.05654762149788439, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.362351194024086, + "step": 264 + }, + { + "completion_length": 1264.107177734375, + "epoch": 0.741777466759972, + "grad_norm": 3.2450530529022217, + "kl": 4.47265625, + "learning_rate": 3.7869883314152114e-06, + "loss": 0.1792, + "reward": 0.3757440522313118, + "reward_std": 0.27574628219008446, + "rewards/accuracy_reward": 0.011904762359336019, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3638392984867096, + "step": 265 + }, + { + "completion_length": 1407.327392578125, + "epoch": 0.7445766270118964, + "grad_norm": 4.581850051879883, + "kl": 3.7890625, + "learning_rate": 3.7105997650152326e-06, + "loss": 0.1517, + "reward": 0.4494047611951828, + "reward_std": 0.32648998498916626, + "rewards/accuracy_reward": 0.05654762126505375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3928571492433548, + "step": 266 + }, + { + "completion_length": 1342.452392578125, + "epoch": 0.7473757872638208, + "grad_norm": 4.61456298828125, + "kl": 3.7734375, + "learning_rate": 3.6348136131567537e-06, + "loss": 0.1511, + "reward": 0.4255952462553978, + "reward_std": 0.3115469329059124, + "rewards/accuracy_reward": 0.0505952388048172, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3750000074505806, + "step": 267 + }, + { + "completion_length": 1301.7678833007812, + "epoch": 0.7501749475157453, + "grad_norm": 1.881331443786621, + "kl": 4.50390625, + "learning_rate": 3.5596371348276325e-06, + "loss": 0.1802, + "reward": 0.3928571492433548, + "reward_std": 0.2952713444828987, + "rewards/accuracy_reward": 0.020833333721384406, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3720238134264946, + "step": 268 + }, + { + "completion_length": 1408.3869018554688, + "epoch": 0.7529741077676697, + "grad_norm": 3.126847743988037, + "kl": 5.6953125, + "learning_rate": 3.485077530619664e-06, + "loss": 0.2277, + "reward": 0.4129464328289032, + "reward_std": 0.2988455295562744, + "rewards/accuracy_reward": 0.020833333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3921131044626236, + "step": 269 + }, + { + "completion_length": 1398.3423156738281, + "epoch": 0.7557732680195941, + "grad_norm": 8.754128456115723, + "kl": 6.8125, + "learning_rate": 3.4111419420388904e-06, + "loss": 0.2724, + "reward": 0.3616071492433548, + "reward_std": 0.27205846458673477, + "rewards/accuracy_reward": 0.011904762126505375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.349702388048172, + "step": 270 + }, + { + "completion_length": 1326.4911193847656, + "epoch": 0.7585724282715185, + "grad_norm": 3.1750030517578125, + "kl": 5.984375, + "learning_rate": 3.3378374508215704e-06, + "loss": 0.2394, + "reward": 0.4315476268529892, + "reward_std": 0.3337167501449585, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3601190596818924, + "step": 271 + }, + { + "completion_length": 1418.4197082519531, + "epoch": 0.761371588523443, + "grad_norm": 2.0993704795837402, + "kl": 5.3046875, + "learning_rate": 3.2651710782558798e-06, + "loss": 0.2122, + "reward": 0.4672619178891182, + "reward_std": 0.37080781161785126, + "rewards/accuracy_reward": 0.06845238292589784, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3988095372915268, + "step": 272 + }, + { + "completion_length": 1315.8214721679688, + "epoch": 0.7641707487753674, + "grad_norm": 2.2858619689941406, + "kl": 5.890625, + "learning_rate": 3.1931497845093753e-06, + "loss": 0.2354, + "reward": 0.4181547686457634, + "reward_std": 0.33285027742385864, + "rewards/accuracy_reward": 0.029761905781924725, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3883928656578064, + "step": 273 + }, + { + "completion_length": 1437.1935119628906, + "epoch": 0.7669699090272918, + "grad_norm": 5.111063003540039, + "kl": 6.6015625, + "learning_rate": 3.121780467962353e-06, + "loss": 0.264, + "reward": 0.396577388048172, + "reward_std": 0.361224927008152, + "rewards/accuracy_reward": 0.059523810632526875, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3370535746216774, + "step": 274 + }, + { + "completion_length": 1372.7500305175781, + "epoch": 0.7697690692792163, + "grad_norm": 3.3286380767822266, + "kl": 4.90625, + "learning_rate": 3.0510699645470988e-06, + "loss": 0.1963, + "reward": 0.4002976268529892, + "reward_std": 0.30905013531446457, + "rewards/accuracy_reward": 0.023809524718672037, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.376488097012043, + "step": 275 + }, + { + "completion_length": 1388.5565795898438, + "epoch": 0.7725682295311407, + "grad_norm": 3.681853771209717, + "kl": 5.109375, + "learning_rate": 2.981025047093118e-06, + "loss": 0.2045, + "reward": 0.3973214402794838, + "reward_std": 0.31496061384677887, + "rewards/accuracy_reward": 0.023809524020180106, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3735119104385376, + "step": 276 + }, + { + "completion_length": 1337.7916870117188, + "epoch": 0.7753673897830651, + "grad_norm": 1.2482109069824219, + "kl": 5.671875, + "learning_rate": 2.911652424678425e-06, + "loss": 0.2271, + "reward": 0.450892873108387, + "reward_std": 0.3896239772439003, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3616071417927742, + "step": 277 + }, + { + "completion_length": 1366.8065795898438, + "epoch": 0.7781665500349895, + "grad_norm": 2.4408419132232666, + "kl": 5.7578125, + "learning_rate": 2.8429587419869288e-06, + "loss": 0.2301, + "reward": 0.4144345298409462, + "reward_std": 0.3503517061471939, + "rewards/accuracy_reward": 0.04761904873885214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3668154776096344, + "step": 278 + }, + { + "completion_length": 1322.5059814453125, + "epoch": 0.780965710286914, + "grad_norm": 2.8470442295074463, + "kl": 6.03125, + "learning_rate": 2.77495057867198e-06, + "loss": 0.2415, + "reward": 0.3861607238650322, + "reward_std": 0.32331252098083496, + "rewards/accuracy_reward": 0.03571428614668548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3504464328289032, + "step": 279 + }, + { + "completion_length": 1425.0238647460938, + "epoch": 0.7837648705388384, + "grad_norm": 2.543036460876465, + "kl": 5.3125, + "learning_rate": 2.7076344487261695e-06, + "loss": 0.2128, + "reward": 0.4226190522313118, + "reward_std": 0.33169643953442574, + "rewards/accuracy_reward": 0.0505952388048172, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3720238134264946, + "step": 280 + }, + { + "completion_length": 1343.0387268066406, + "epoch": 0.7865640307907628, + "grad_norm": 3.472309112548828, + "kl": 4.32421875, + "learning_rate": 2.6410167998573945e-06, + "loss": 0.1731, + "reward": 0.4025297686457634, + "reward_std": 0.31637272238731384, + "rewards/accuracy_reward": 0.032738095382228494, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3697916716337204, + "step": 281 + }, + { + "completion_length": 1440.7142944335938, + "epoch": 0.7893631910426872, + "grad_norm": 2.190171480178833, + "kl": 5.9453125, + "learning_rate": 2.57510401287128e-06, + "loss": 0.2377, + "reward": 0.383928582072258, + "reward_std": 0.3282015360891819, + "rewards/accuracy_reward": 0.035714287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.348214291036129, + "step": 282 + }, + { + "completion_length": 1531.0774536132812, + "epoch": 0.7921623512946117, + "grad_norm": 2.0192794799804688, + "kl": 5.265625, + "learning_rate": 2.5099024010600136e-06, + "loss": 0.2104, + "reward": 0.412202388048172, + "reward_std": 0.3332761228084564, + "rewards/accuracy_reward": 0.04166666744276881, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3705357238650322, + "step": 283 + }, + { + "completion_length": 1418.4345397949219, + "epoch": 0.794961511546536, + "grad_norm": 1.8403772115707397, + "kl": 5.08203125, + "learning_rate": 2.445418209597632e-06, + "loss": 0.2031, + "reward": 0.3898809626698494, + "reward_std": 0.3033088222146034, + "rewards/accuracy_reward": 0.023809524020180106, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3660714402794838, + "step": 284 + }, + { + "completion_length": 1420.5357360839844, + "epoch": 0.7977606717984604, + "grad_norm": 4.50986385345459, + "kl": 5.671875, + "learning_rate": 2.381657614941858e-06, + "loss": 0.2268, + "reward": 0.4181547686457634, + "reward_std": 0.3235014081001282, + "rewards/accuracy_reward": 0.03869047691114247, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.379464291036129, + "step": 285 + }, + { + "completion_length": 1432.2560119628906, + "epoch": 0.8005598320503848, + "grad_norm": 1.5560705661773682, + "kl": 5.09375, + "learning_rate": 2.318626724242491e-06, + "loss": 0.2039, + "reward": 0.4293154776096344, + "reward_std": 0.3335767611861229, + "rewards/accuracy_reward": 0.06250000209547579, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.366815485060215, + "step": 286 + }, + { + "completion_length": 1490.09228515625, + "epoch": 0.8033589923023093, + "grad_norm": 5.74359655380249, + "kl": 6.3671875, + "learning_rate": 2.2563315747564575e-06, + "loss": 0.2549, + "reward": 0.3787202462553978, + "reward_std": 0.3090299814939499, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3578869178891182, + "step": 287 + }, + { + "completion_length": 1463.4911499023438, + "epoch": 0.8061581525542337, + "grad_norm": 3.4460787773132324, + "kl": 5.7265625, + "learning_rate": 2.1947781332695406e-06, + "loss": 0.2294, + "reward": 0.4114583432674408, + "reward_std": 0.35805678367614746, + "rewards/accuracy_reward": 0.06845238269306719, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3430059552192688, + "step": 288 + }, + { + "completion_length": 1449.8660888671875, + "epoch": 0.8089573128061581, + "grad_norm": 1.7255327701568604, + "kl": 5.40625, + "learning_rate": 2.133972295524875e-06, + "loss": 0.2163, + "reward": 0.4226190522313118, + "reward_std": 0.3677496537566185, + "rewards/accuracy_reward": 0.05952381086535752, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3630952462553978, + "step": 289 + }, + { + "completion_length": 1400.7113342285156, + "epoch": 0.8117564730580826, + "grad_norm": 2.2941792011260986, + "kl": 4.2734375, + "learning_rate": 2.073919885658223e-06, + "loss": 0.1708, + "reward": 0.3906250074505806, + "reward_std": 0.32365576177835464, + "rewards/accuracy_reward": 0.035714287078008056, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3549107238650322, + "step": 290 + }, + { + "completion_length": 1439.5803833007812, + "epoch": 0.814555633310007, + "grad_norm": 2.435476064682007, + "kl": 4.796875, + "learning_rate": 2.0146266556401405e-06, + "loss": 0.192, + "reward": 0.3727678656578064, + "reward_std": 0.29314519464969635, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3459821492433548, + "step": 291 + }, + { + "completion_length": 1361.0684814453125, + "epoch": 0.8173547935619314, + "grad_norm": 3.288947582244873, + "kl": 4.7109375, + "learning_rate": 1.956098284725031e-06, + "loss": 0.1886, + "reward": 0.3995535746216774, + "reward_std": 0.34060153365135193, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3549107238650322, + "step": 292 + }, + { + "completion_length": 1472.96728515625, + "epoch": 0.8201539538138558, + "grad_norm": 4.037724494934082, + "kl": 4.4296875, + "learning_rate": 1.898340378907172e-06, + "loss": 0.177, + "reward": 0.4285714328289032, + "reward_std": 0.36160846054553986, + "rewards/accuracy_reward": 0.05654762126505375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3720238134264946, + "step": 293 + }, + { + "completion_length": 1345.7411193847656, + "epoch": 0.8229531140657803, + "grad_norm": 1.3143645524978638, + "kl": 5.0625, + "learning_rate": 1.8413584703837618e-06, + "loss": 0.2027, + "reward": 0.412202388048172, + "reward_std": 0.3475293517112732, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3675595298409462, + "step": 294 + }, + { + "completion_length": 1241.8512268066406, + "epoch": 0.8257522743177047, + "grad_norm": 2.2974655628204346, + "kl": 4.34765625, + "learning_rate": 1.7851580170250304e-06, + "loss": 0.1739, + "reward": 0.4330357238650322, + "reward_std": 0.345614917576313, + "rewards/accuracy_reward": 0.02976190554909408, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4032738208770752, + "step": 295 + }, + { + "completion_length": 1475.3660888671875, + "epoch": 0.8285514345696291, + "grad_norm": 10.724311828613281, + "kl": 6.9765625, + "learning_rate": 1.729744401851463e-06, + "loss": 0.2795, + "reward": 0.4136904776096344, + "reward_std": 0.3456144332885742, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3690476268529892, + "step": 296 + }, + { + "completion_length": 1376.4464416503906, + "epoch": 0.8313505948215535, + "grad_norm": 11.270524024963379, + "kl": 7.4921875, + "learning_rate": 1.6751229325182194e-06, + "loss": 0.2992, + "reward": 0.421875, + "reward_std": 0.3736114352941513, + "rewards/accuracy_reward": 0.05357142933644354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3683035746216774, + "step": 297 + }, + { + "completion_length": 1458.2619323730469, + "epoch": 0.834149755073478, + "grad_norm": 1.6163893938064575, + "kl": 4.96484375, + "learning_rate": 1.6212988408067354e-06, + "loss": 0.1985, + "reward": 0.4122023805975914, + "reward_std": 0.3508900851011276, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3705357164144516, + "step": 298 + }, + { + "completion_length": 1412.0714721679688, + "epoch": 0.8369489153254024, + "grad_norm": 5.344667911529541, + "kl": 5.6953125, + "learning_rate": 1.5682772821236192e-06, + "loss": 0.2277, + "reward": 0.4099702537059784, + "reward_std": 0.37672894448041916, + "rewards/accuracy_reward": 0.06250000093132257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3474702462553978, + "step": 299 + }, + { + "completion_length": 1304.702392578125, + "epoch": 0.8397480755773268, + "grad_norm": 2.2289657592773438, + "kl": 4.14453125, + "learning_rate": 1.516063335006851e-06, + "loss": 0.1661, + "reward": 0.4494047686457634, + "reward_std": 0.36631667613983154, + "rewards/accuracy_reward": 0.05059523927047849, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3988095298409462, + "step": 300 + }, + { + "completion_length": 1342.9880981445312, + "epoch": 0.8425472358292512, + "grad_norm": 4.20701265335083, + "kl": 4.8125, + "learning_rate": 1.4646620006393497e-06, + "loss": 0.1926, + "reward": 0.4776785895228386, + "reward_std": 0.36763861775398254, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4241071492433548, + "step": 301 + }, + { + "completion_length": 1246.5506286621094, + "epoch": 0.8453463960811757, + "grad_norm": 3.4226760864257812, + "kl": 3.50390625, + "learning_rate": 1.4140782023699396e-06, + "loss": 0.1403, + "reward": 0.449404776096344, + "reward_std": 0.35377900302410126, + "rewards/accuracy_reward": 0.06845238246023655, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.380952388048172, + "step": 302 + }, + { + "completion_length": 1194.27978515625, + "epoch": 0.8481455563331001, + "grad_norm": 3.308936834335327, + "kl": 3.59375, + "learning_rate": 1.3643167852417894e-06, + "loss": 0.1436, + "reward": 0.5215773954987526, + "reward_std": 0.3950745388865471, + "rewards/accuracy_reward": 0.09226190764456987, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.429315485060215, + "step": 303 + }, + { + "completion_length": 1386.0178833007812, + "epoch": 0.8509447165850245, + "grad_norm": 1.7825531959533691, + "kl": 5.11328125, + "learning_rate": 1.3153825155283395e-06, + "loss": 0.2046, + "reward": 0.4040178656578064, + "reward_std": 0.33551811426877975, + "rewards/accuracy_reward": 0.03571428661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3683035746216774, + "step": 304 + }, + { + "completion_length": 1393.9791870117188, + "epoch": 0.853743876836949, + "grad_norm": 2.509859323501587, + "kl": 4.921875, + "learning_rate": 1.2672800802767715e-06, + "loss": 0.197, + "reward": 0.4382440596818924, + "reward_std": 0.34045761823654175, + "rewards/accuracy_reward": 0.04166666744276881, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.396577388048172, + "step": 305 + }, + { + "completion_length": 1370.71728515625, + "epoch": 0.8565430370888734, + "grad_norm": 1.4187800884246826, + "kl": 4.78515625, + "learning_rate": 1.2200140868590759e-06, + "loss": 0.1912, + "reward": 0.4665178656578064, + "reward_std": 0.3823399096727371, + "rewards/accuracy_reward": 0.056547620333731174, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4099702462553978, + "step": 306 + }, + { + "completion_length": 1359.9583740234375, + "epoch": 0.8593421973407978, + "grad_norm": 3.4122722148895264, + "kl": 5.875, + "learning_rate": 1.1735890625307466e-06, + "loss": 0.2348, + "reward": 0.3898809626698494, + "reward_std": 0.3265804722905159, + "rewards/accuracy_reward": 0.020833333488553762, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3690476268529892, + "step": 307 + }, + { + "completion_length": 1379.1280212402344, + "epoch": 0.8621413575927221, + "grad_norm": 1.6806628704071045, + "kl": 4.80859375, + "learning_rate": 1.128009453997142e-06, + "loss": 0.1923, + "reward": 0.430803582072258, + "reward_std": 0.33175840973854065, + "rewards/accuracy_reward": 0.03571428591385484, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3950892984867096, + "step": 308 + }, + { + "completion_length": 1318.3006286621094, + "epoch": 0.8649405178446467, + "grad_norm": 1.7152204513549805, + "kl": 4.5, + "learning_rate": 1.0832796269875757e-06, + "loss": 0.1798, + "reward": 0.4382440596818924, + "reward_std": 0.3369225934147835, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4114583432674408, + "step": 309 + }, + { + "completion_length": 1283.8125305175781, + "epoch": 0.867739678096571, + "grad_norm": 2.0499556064605713, + "kl": 5.0, + "learning_rate": 1.0394038658371575e-06, + "loss": 0.2, + "reward": 0.4523809626698494, + "reward_std": 0.35857996344566345, + "rewards/accuracy_reward": 0.05059523927047849, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4017857238650322, + "step": 310 + }, + { + "completion_length": 1273.2559814453125, + "epoch": 0.8705388383484954, + "grad_norm": 2.2152321338653564, + "kl": 5.3671875, + "learning_rate": 9.963863730764222e-07, + "loss": 0.2144, + "reward": 0.4322916716337204, + "reward_std": 0.3260413706302643, + "rewards/accuracy_reward": 0.038690477376803756, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.393601194024086, + "step": 311 + }, + { + "completion_length": 1418.8006286621094, + "epoch": 0.8733379986004198, + "grad_norm": 3.0493364334106445, + "kl": 5.21875, + "learning_rate": 9.542312690288035e-07, + "loss": 0.2089, + "reward": 0.4427083358168602, + "reward_std": 0.3837737664580345, + "rewards/accuracy_reward": 0.06547619169577956, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3772321492433548, + "step": 312 + }, + { + "completion_length": 1358.0744323730469, + "epoch": 0.8761371588523443, + "grad_norm": 1.1914238929748535, + "kl": 4.921875, + "learning_rate": 9.129425914159839e-07, + "loss": 0.1969, + "reward": 0.446428582072258, + "reward_std": 0.36544618755578995, + "rewards/accuracy_reward": 0.05059524020180106, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3958333432674408, + "step": 313 + }, + { + "completion_length": 1379.3125, + "epoch": 0.8789363191042687, + "grad_norm": 1.5079797506332397, + "kl": 5.2265625, + "learning_rate": 8.725242949711376e-07, + "loss": 0.2092, + "reward": 0.4531250149011612, + "reward_std": 0.3662826642394066, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4114583432674408, + "step": 314 + }, + { + "completion_length": 1399.7411193847656, + "epoch": 0.8817354793561931, + "grad_norm": 2.1089096069335938, + "kl": 5.5390625, + "learning_rate": 8.329802510601559e-07, + "loss": 0.2214, + "reward": 0.4002976268529892, + "reward_std": 0.3186268284916878, + "rewards/accuracy_reward": 0.026785714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.373511902987957, + "step": 315 + }, + { + "completion_length": 1410.8958740234375, + "epoch": 0.8845346396081175, + "grad_norm": 2.0428688526153564, + "kl": 5.296875, + "learning_rate": 7.943142473108234e-07, + "loss": 0.2116, + "reward": 0.4188988283276558, + "reward_std": 0.3568192198872566, + "rewards/accuracy_reward": 0.05952381179668009, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3593750074505806, + "step": 316 + }, + { + "completion_length": 1339.77978515625, + "epoch": 0.887333799860042, + "grad_norm": 2.3376100063323975, + "kl": 4.58203125, + "learning_rate": 7.565299872500331e-07, + "loss": 0.1834, + "reward": 0.4300595372915268, + "reward_std": 0.3629925549030304, + "rewards/accuracy_reward": 0.050595239736139774, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3794642984867096, + "step": 317 + }, + { + "completion_length": 1309.8809814453125, + "epoch": 0.8901329601119664, + "grad_norm": 3.16902494430542, + "kl": 4.140625, + "learning_rate": 7.196310899490577e-07, + "loss": 0.1654, + "reward": 0.4211309626698494, + "reward_std": 0.3569025695323944, + "rewards/accuracy_reward": 0.0505952388048172, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3705357164144516, + "step": 318 + }, + { + "completion_length": 1399.6161193847656, + "epoch": 0.8929321203638908, + "grad_norm": 2.646667003631592, + "kl": 4.8046875, + "learning_rate": 6.836210896769014e-07, + "loss": 0.192, + "reward": 0.395089291036129, + "reward_std": 0.31383977830410004, + "rewards/accuracy_reward": 0.014880952425301075, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3802083432674408, + "step": 319 + }, + { + "completion_length": 1362.9167175292969, + "epoch": 0.8957312806158153, + "grad_norm": 3.6319262981414795, + "kl": 4.0859375, + "learning_rate": 6.485034355617748e-07, + "loss": 0.1636, + "reward": 0.443452388048172, + "reward_std": 0.35718773305416107, + "rewards/accuracy_reward": 0.05059523903764784, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3928571492433548, + "step": 320 + }, + { + "completion_length": 1361.71728515625, + "epoch": 0.8985304408677397, + "grad_norm": 3.302518844604492, + "kl": 4.546875, + "learning_rate": 6.142814912607409e-07, + "loss": 0.1818, + "reward": 0.4479166716337204, + "reward_std": 0.36938507854938507, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4032738208770752, + "step": 321 + }, + { + "completion_length": 1297.4255981445312, + "epoch": 0.9013296011196641, + "grad_norm": 2.2909963130950928, + "kl": 4.55078125, + "learning_rate": 5.809585346375235e-07, + "loss": 0.182, + "reward": 0.395089291036129, + "reward_std": 0.3037964403629303, + "rewards/accuracy_reward": 0.011904762126505375, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3831845298409462, + "step": 322 + }, + { + "completion_length": 1392.0000305175781, + "epoch": 0.9041287613715885, + "grad_norm": 1.4405887126922607, + "kl": 4.8671875, + "learning_rate": 5.485377574485528e-07, + "loss": 0.1947, + "reward": 0.409226194024086, + "reward_std": 0.3471325859427452, + "rewards/accuracy_reward": 0.04166666720993817, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3675595298409462, + "step": 323 + }, + { + "completion_length": 1483.6488342285156, + "epoch": 0.906927921623513, + "grad_norm": 1.6041861772537231, + "kl": 4.7265625, + "learning_rate": 5.17022265037247e-07, + "loss": 0.189, + "reward": 0.4159226268529892, + "reward_std": 0.32322467491030693, + "rewards/accuracy_reward": 0.03571428591385484, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3802083358168602, + "step": 324 + }, + { + "completion_length": 1420.0684814453125, + "epoch": 0.9097270818754374, + "grad_norm": 1.5601868629455566, + "kl": 5.1875, + "learning_rate": 4.864150760365771e-07, + "loss": 0.2075, + "reward": 0.3965773805975914, + "reward_std": 0.30500587075948715, + "rewards/accuracy_reward": 0.017857143189758062, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3787202462553978, + "step": 325 + }, + { + "completion_length": 1382.6518249511719, + "epoch": 0.9125262421273618, + "grad_norm": 3.680711269378662, + "kl": 5.9609375, + "learning_rate": 4.567191220799305e-07, + "loss": 0.2385, + "reward": 0.4084821492433548, + "reward_std": 0.33489790558815, + "rewards/accuracy_reward": 0.029761905781924725, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3787202462553978, + "step": 326 + }, + { + "completion_length": 1364.1607055664062, + "epoch": 0.9153254023792862, + "grad_norm": 3.90324330329895, + "kl": 5.9140625, + "learning_rate": 4.2793724752031807e-07, + "loss": 0.2367, + "reward": 0.3869047686457634, + "reward_std": 0.3064044490456581, + "rewards/accuracy_reward": 0.02380952425301075, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3630952462553978, + "step": 327 + }, + { + "completion_length": 1460.7887268066406, + "epoch": 0.9181245626312107, + "grad_norm": 3.9847612380981445, + "kl": 4.9296875, + "learning_rate": 4.000722091579301e-07, + "loss": 0.1974, + "reward": 0.3928571566939354, + "reward_std": 0.29354726523160934, + "rewards/accuracy_reward": 0.017857143422588706, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3750000074505806, + "step": 328 + }, + { + "completion_length": 1374.6875305175781, + "epoch": 0.9209237228831351, + "grad_norm": 1.0949150323867798, + "kl": 4.45703125, + "learning_rate": 3.731266759760854e-07, + "loss": 0.1784, + "reward": 0.4977678656578064, + "reward_std": 0.41033344715833664, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4084821566939354, + "step": 329 + }, + { + "completion_length": 1325.15478515625, + "epoch": 0.9237228831350595, + "grad_norm": 1.7551958560943604, + "kl": 5.015625, + "learning_rate": 3.471032288855869e-07, + "loss": 0.2006, + "reward": 0.4285714328289032, + "reward_std": 0.3321956619620323, + "rewards/accuracy_reward": 0.026785714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4017857164144516, + "step": 330 + }, + { + "completion_length": 1407.1607666015625, + "epoch": 0.9265220433869839, + "grad_norm": 2.623478412628174, + "kl": 4.953125, + "learning_rate": 3.2200436047752026e-07, + "loss": 0.1982, + "reward": 0.443452388048172, + "reward_std": 0.3565198704600334, + "rewards/accuracy_reward": 0.04761904873885214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3958333358168602, + "step": 331 + }, + { + "completion_length": 1360.5000305175781, + "epoch": 0.9293212036389084, + "grad_norm": 3.1613452434539795, + "kl": 5.4609375, + "learning_rate": 2.978324747844996e-07, + "loss": 0.2181, + "reward": 0.4412202462553978, + "reward_std": 0.3965253308415413, + "rewards/accuracy_reward": 0.0803571455180645, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3608631044626236, + "step": 332 + }, + { + "completion_length": 1323.8601379394531, + "epoch": 0.9321203638908327, + "grad_norm": 1.4278522729873657, + "kl": 4.68359375, + "learning_rate": 2.745898870504116e-07, + "loss": 0.1871, + "reward": 0.429315485060215, + "reward_std": 0.3351411744952202, + "rewards/accuracy_reward": 0.04166666814126074, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3876488134264946, + "step": 333 + }, + { + "completion_length": 1216.2262268066406, + "epoch": 0.9349195241427571, + "grad_norm": 3.5842936038970947, + "kl": 4.09375, + "learning_rate": 2.5227882350865154e-07, + "loss": 0.1639, + "reward": 0.4486607164144516, + "reward_std": 0.3199794441461563, + "rewards/accuracy_reward": 0.023809524485841393, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4248512014746666, + "step": 334 + }, + { + "completion_length": 1375.0744323730469, + "epoch": 0.9377186843946816, + "grad_norm": 0.9856395125389099, + "kl": 4.546875, + "learning_rate": 2.309014211688865e-07, + "loss": 0.1818, + "reward": 0.4657738208770752, + "reward_std": 0.38036076724529266, + "rewards/accuracy_reward": 0.059523810632526875, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4062500149011612, + "step": 335 + }, + { + "completion_length": 1259.7470397949219, + "epoch": 0.940517844646606, + "grad_norm": 2.5921571254730225, + "kl": 4.28515625, + "learning_rate": 2.104597276123721e-07, + "loss": 0.1716, + "reward": 0.4508928656578064, + "reward_std": 0.3156162351369858, + "rewards/accuracy_reward": 0.03571428661234677, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.415178582072258, + "step": 336 + }, + { + "completion_length": 1216.9196472167969, + "epoch": 0.9433170048985304, + "grad_norm": 1.2123003005981445, + "kl": 4.6171875, + "learning_rate": 1.909557007958307e-07, + "loss": 0.1845, + "reward": 0.4456845298409462, + "reward_std": 0.3478916212916374, + "rewards/accuracy_reward": 0.059523810632526875, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3861607164144516, + "step": 337 + }, + { + "completion_length": 1312.4881286621094, + "epoch": 0.9461161651504548, + "grad_norm": 1.5337111949920654, + "kl": 4.96875, + "learning_rate": 1.7239120886390347e-07, + "loss": 0.199, + "reward": 0.4270833358168602, + "reward_std": 0.33733995258808136, + "rewards/accuracy_reward": 0.05357142933644354, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3735119178891182, + "step": 338 + }, + { + "completion_length": 1330.264892578125, + "epoch": 0.9489153254023793, + "grad_norm": 1.6741083860397339, + "kl": 4.17578125, + "learning_rate": 1.5476802997022812e-07, + "loss": 0.1672, + "reward": 0.443452388048172, + "reward_std": 0.34243104606866837, + "rewards/accuracy_reward": 0.03571428614668548, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4077381119132042, + "step": 339 + }, + { + "completion_length": 1373.5535888671875, + "epoch": 0.9517144856543037, + "grad_norm": 2.481156349182129, + "kl": 5.4375, + "learning_rate": 1.3808785210711606e-07, + "loss": 0.2177, + "reward": 0.4382440596818924, + "reward_std": 0.345366507768631, + "rewards/accuracy_reward": 0.07738095545209944, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3608631044626236, + "step": 340 + }, + { + "completion_length": 1309.1607360839844, + "epoch": 0.9545136459062281, + "grad_norm": 1.6094251871109009, + "kl": 4.78125, + "learning_rate": 1.2235227294387085e-07, + "loss": 0.1915, + "reward": 0.430803582072258, + "reward_std": 0.33838749676942825, + "rewards/accuracy_reward": 0.04761904873885214, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3831845298409462, + "step": 341 + }, + { + "completion_length": 1301.107177734375, + "epoch": 0.9573128061581525, + "grad_norm": 1.3266340494155884, + "kl": 4.453125, + "learning_rate": 1.075627996737627e-07, + "loss": 0.178, + "reward": 0.4389881044626236, + "reward_std": 0.3299376741051674, + "rewards/accuracy_reward": 0.03869047714397311, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4002976343035698, + "step": 342 + }, + { + "completion_length": 1413.6339416503906, + "epoch": 0.960111966410077, + "grad_norm": 1.5120221376419067, + "kl": 4.8359375, + "learning_rate": 9.372084886966392e-08, + "loss": 0.1937, + "reward": 0.3973214402794838, + "reward_std": 0.3036072328686714, + "rewards/accuracy_reward": 0.026785714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3705357164144516, + "step": 343 + }, + { + "completion_length": 1272.34228515625, + "epoch": 0.9629111266620014, + "grad_norm": 0.996314525604248, + "kl": 4.765625, + "learning_rate": 8.082774634836754e-08, + "loss": 0.1903, + "reward": 0.426339291036129, + "reward_std": 0.36217150837183, + "rewards/accuracy_reward": 0.038690477376803756, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3876488208770752, + "step": 344 + }, + { + "completion_length": 1278.4583435058594, + "epoch": 0.9657102869139258, + "grad_norm": 1.207736611366272, + "kl": 4.5859375, + "learning_rate": 6.888472704359661e-08, + "loss": 0.1837, + "reward": 0.5007440596818924, + "reward_std": 0.3912041410803795, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.396577388048172, + "step": 345 + }, + { + "completion_length": 1424.672607421875, + "epoch": 0.9685094471658502, + "grad_norm": 1.443676471710205, + "kl": 5.015625, + "learning_rate": 5.7892934887717746e-08, + "loss": 0.2006, + "reward": 0.4166666716337204, + "reward_std": 0.34780431538820267, + "rewards/accuracy_reward": 0.05059523903764784, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3660714402794838, + "step": 346 + }, + { + "completion_length": 1192.8810119628906, + "epoch": 0.9713086074177747, + "grad_norm": 1.0321071147918701, + "kl": 4.84375, + "learning_rate": 4.785342270217319e-08, + "loss": 0.1938, + "reward": 0.4367559626698494, + "reward_std": 0.3517054095864296, + "rewards/accuracy_reward": 0.041666666977107525, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3950892984867096, + "step": 347 + }, + { + "completion_length": 1383.0089416503906, + "epoch": 0.9741077676696991, + "grad_norm": 1.094332218170166, + "kl": 4.890625, + "learning_rate": 3.8767152096641504e-08, + "loss": 0.1957, + "reward": 0.4248511865735054, + "reward_std": 0.35158083587884903, + "rewards/accuracy_reward": 0.044642857974395156, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3802083432674408, + "step": 348 + }, + { + "completion_length": 1428.9166870117188, + "epoch": 0.9769069279216235, + "grad_norm": 1.5254340171813965, + "kl": 4.7421875, + "learning_rate": 3.063499337692788e-08, + "loss": 0.19, + "reward": 0.444940485060215, + "reward_std": 0.3674090802669525, + "rewards/accuracy_reward": 0.05059524020180106, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3943452462553978, + "step": 349 + }, + { + "completion_length": 1357.2202758789062, + "epoch": 0.979706088173548, + "grad_norm": 1.4848238229751587, + "kl": 5.40625, + "learning_rate": 2.3457725461607518e-08, + "loss": 0.2161, + "reward": 0.395089291036129, + "reward_std": 0.3104281648993492, + "rewards/accuracy_reward": 0.020833333721384406, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3742559626698494, + "step": 350 + }, + { + "completion_length": 1225.6339416503906, + "epoch": 0.9825052484254724, + "grad_norm": 1.3038214445114136, + "kl": 4.8984375, + "learning_rate": 1.7236035807416397e-08, + "loss": 0.1959, + "reward": 0.4546131044626236, + "reward_std": 0.33693618327379227, + "rewards/accuracy_reward": 0.04464285774156451, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4099702462553978, + "step": 351 + }, + { + "completion_length": 1316.8541564941406, + "epoch": 0.9853044086773968, + "grad_norm": 2.081798791885376, + "kl": 5.21875, + "learning_rate": 1.1970520343408398e-08, + "loss": 0.2088, + "reward": 0.3787202462553978, + "reward_std": 0.30557170510292053, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3578869104385376, + "step": 352 + }, + { + "completion_length": 1275.514892578125, + "epoch": 0.9881035689293212, + "grad_norm": 2.5191140174865723, + "kl": 5.05078125, + "learning_rate": 7.661683413868748e-09, + "loss": 0.202, + "reward": 0.4523809552192688, + "reward_std": 0.3754509463906288, + "rewards/accuracy_reward": 0.06250000116415322, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3898809626698494, + "step": 353 + }, + { + "completion_length": 1411.5000305175781, + "epoch": 0.9909027291812457, + "grad_norm": 2.2324330806732178, + "kl": 5.234375, + "learning_rate": 4.309937730015978e-09, + "loss": 0.209, + "reward": 0.3943452462553978, + "reward_std": 0.3194137141108513, + "rewards/accuracy_reward": 0.032738096080720425, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3616071492433548, + "step": 354 + }, + { + "completion_length": 1238.8184814453125, + "epoch": 0.9937018894331701, + "grad_norm": 3.0568997859954834, + "kl": 4.3671875, + "learning_rate": 1.915604330464671e-09, + "loss": 0.1749, + "reward": 0.443452388048172, + "reward_std": 0.3478512540459633, + "rewards/accuracy_reward": 0.026785714784637094, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4166666716337204, + "step": 355 + }, + { + "completion_length": 1400.2470397949219, + "epoch": 0.9965010496850945, + "grad_norm": 1.7922987937927246, + "kl": 4.5078125, + "learning_rate": 4.789125504778281e-10, + "loss": 0.1803, + "reward": 0.454613097012043, + "reward_std": 0.3549215570092201, + "rewards/accuracy_reward": 0.04761904897168279, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.4069940522313118, + "step": 356 + }, + { + "completion_length": 1322.1012268066406, + "epoch": 0.9993002099370188, + "grad_norm": 1.4842438697814941, + "kl": 4.9765625, + "learning_rate": 0.0, + "loss": 0.1989, + "reward": 0.4471726268529892, + "reward_std": 0.3474733680486679, + "rewards/accuracy_reward": 0.06250000093132257, + "rewards/format_reward": 0.0, + "rewards/tag_count_reward": 0.3846726194024086, + "step": 357 + }, + { + "epoch": 0.9993002099370188, + "step": 357, "total_flos": 0.0, - "train_loss": 0.002700890247054666, - "train_runtime": 4286.7009, - "train_samples_per_second": 0.233, - "train_steps_per_second": 0.008 + "train_loss": 0.14923175725754198, + "train_runtime": 53017.8845, + "train_samples_per_second": 0.189, + "train_steps_per_second": 0.007 } ], "logging_steps": 1, - "max_steps": 35, + "max_steps": 357, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500,