{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 894.0833435058594, "epoch": 0.0021333333333333334, "grad_norm": 0.17310986830806582, "kl": 0.0, "learning_rate": 2.127659574468085e-08, "loss": 0.0183, "reward": 0.7500000298023224, "reward_std": 0.3813292533159256, "rewards/accuracy_reward": 0.3125000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375000149011612, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 880.7187652587891, "epoch": 0.004266666666666667, "grad_norm": 0.16276198736608005, "kl": 0.0, "learning_rate": 4.25531914893617e-08, "loss": 0.0314, "reward": 0.7526041865348816, "reward_std": 0.3560462072491646, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.440104179084301, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 871.1354370117188, "epoch": 0.0064, "grad_norm": 0.1506266815932542, "kl": 2.1219253540039062e-05, "learning_rate": 6.382978723404254e-08, "loss": 0.0353, "reward": 0.6302083432674408, "reward_std": 0.3013310767710209, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4010416716337204, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 942.0937652587891, "epoch": 0.008533333333333334, "grad_norm": 0.16478813088653557, "kl": 1.8753111362457275e-05, "learning_rate": 8.51063829787234e-08, "loss": 0.0317, "reward": 0.6119791865348816, "reward_std": 0.3484013229608536, "rewards/accuracy_reward": 0.2187500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3932291716337204, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 843.8541870117188, "epoch": 0.010666666666666666, "grad_norm": 0.20469530540041506, "kl": 2.9385089874267578e-05, "learning_rate": 1.0638297872340425e-07, "loss": 0.0553, "reward": 0.7187500223517418, "reward_std": 0.3990408657118678, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4270833432674408, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 878.7395935058594, "epoch": 0.0128, "grad_norm": 0.14039871090608422, "kl": 1.640617847442627e-05, "learning_rate": 1.2765957446808508e-07, "loss": 0.0399, "reward": 0.6562500149011612, "reward_std": 0.29901912435889244, "rewards/accuracy_reward": 0.22916667349636555, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4270833507180214, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 846.1250305175781, "epoch": 0.014933333333333333, "grad_norm": 0.19580219393645013, "kl": 2.7954578399658203e-05, "learning_rate": 1.4893617021276595e-07, "loss": 0.0596, "reward": 0.6927083432674408, "reward_std": 0.318842139095068, "rewards/accuracy_reward": 0.23958333861082792, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250074505806, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 868.0833435058594, "epoch": 0.017066666666666667, "grad_norm": 0.1646324395539006, "kl": 3.281235694885254e-05, "learning_rate": 1.702127659574468e-07, "loss": 0.0436, "reward": 0.5729166865348816, "reward_std": 0.2931280732154846, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3854166716337204, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 885.0729370117188, "epoch": 0.0192, "grad_norm": 0.17173680857220677, "kl": 2.326071262359619e-05, "learning_rate": 1.9148936170212765e-07, "loss": 0.0606, "reward": 0.6822916865348816, "reward_std": 0.38235192000865936, "rewards/accuracy_reward": 0.2604166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4218750074505806, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 854.9166870117188, "epoch": 0.021333333333333333, "grad_norm": 0.17593762669139573, "kl": 2.586841583251953e-05, "learning_rate": 2.127659574468085e-07, "loss": 0.0259, "reward": 0.6536458432674408, "reward_std": 0.29265937581658363, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.424479179084301, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 892.1979370117188, "epoch": 0.023466666666666667, "grad_norm": 0.181843057081716, "kl": 4.273653030395508e-05, "learning_rate": 2.3404255319148937e-07, "loss": 0.0514, "reward": 0.6145833507180214, "reward_std": 0.26475396007299423, "rewards/accuracy_reward": 0.22916667256504297, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.385416679084301, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 837.8333587646484, "epoch": 0.0256, "grad_norm": 0.20275531176806552, "kl": 2.709031105041504e-05, "learning_rate": 2.5531914893617016e-07, "loss": 0.0699, "reward": 0.7473958432674408, "reward_std": 0.3608727604150772, "rewards/accuracy_reward": 0.2916666669771075, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.455729179084301, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 766.9271087646484, "epoch": 0.027733333333333332, "grad_norm": 0.1848812468936893, "kl": 2.6345252990722656e-05, "learning_rate": 2.7659574468085106e-07, "loss": 0.0275, "reward": 0.7916666865348816, "reward_std": 0.3341744616627693, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4583333358168602, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 894.1041870117188, "epoch": 0.029866666666666666, "grad_norm": 0.1657265791258112, "kl": 2.9712915420532227e-05, "learning_rate": 2.978723404255319e-07, "loss": 0.0513, "reward": 0.708333358168602, "reward_std": 0.3291201740503311, "rewards/accuracy_reward": 0.28125001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4270833432674408, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 935.7708587646484, "epoch": 0.032, "grad_norm": 0.1759764964363314, "kl": 3.552436828613281e-05, "learning_rate": 3.1914893617021275e-07, "loss": 0.0254, "reward": 0.5625000149011612, "reward_std": 0.3009165897965431, "rewards/accuracy_reward": 0.1979166753590107, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3645833432674408, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 889.5000152587891, "epoch": 0.034133333333333335, "grad_norm": 0.20036893134554598, "kl": 3.218650817871094e-05, "learning_rate": 3.404255319148936e-07, "loss": 0.0663, "reward": 0.6848958432674408, "reward_std": 0.357659300789237, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4348958432674408, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 849.3958435058594, "epoch": 0.03626666666666667, "grad_norm": 0.1872151772588199, "kl": 2.2232532501220703e-05, "learning_rate": 3.617021276595745e-07, "loss": 0.0471, "reward": 0.7500000298023224, "reward_std": 0.30543046444654465, "rewards/accuracy_reward": 0.3437500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4062500074505806, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 856.1562652587891, "epoch": 0.0384, "grad_norm": 0.20516969145364147, "kl": 2.6524066925048828e-05, "learning_rate": 3.829787234042553e-07, "loss": 0.0658, "reward": 0.8046875149011612, "reward_std": 0.3947491720318794, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4296875149011612, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 930.3020935058594, "epoch": 0.04053333333333333, "grad_norm": 0.16862112578032326, "kl": 2.0489096641540527e-05, "learning_rate": 4.0425531914893614e-07, "loss": 0.0448, "reward": 0.5468750223517418, "reward_std": 0.24587241373956203, "rewards/accuracy_reward": 0.1770833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.369791679084301, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 931.0000305175781, "epoch": 0.042666666666666665, "grad_norm": 0.14505532791974818, "kl": 2.5272369384765625e-05, "learning_rate": 4.25531914893617e-07, "loss": 0.0042, "reward": 0.4791666716337204, "reward_std": 0.20529046654701233, "rewards/accuracy_reward": 0.13541666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3437500074505806, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 894.1875305175781, "epoch": 0.0448, "grad_norm": 0.21590798838856656, "kl": 2.902001142501831e-05, "learning_rate": 4.4680851063829783e-07, "loss": 0.0544, "reward": 0.5390625149011612, "reward_std": 0.3045891672372818, "rewards/accuracy_reward": 0.16666667442768812, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3723958358168602, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 929.4687652587891, "epoch": 0.046933333333333334, "grad_norm": 0.1446020405000745, "kl": 2.872943878173828e-05, "learning_rate": 4.6808510638297873e-07, "loss": 0.0354, "reward": 0.5885416865348816, "reward_std": 0.24042147770524025, "rewards/accuracy_reward": 0.21875001024454832, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3697916716337204, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 951.7500152587891, "epoch": 0.04906666666666667, "grad_norm": 0.1645371600886353, "kl": 2.2001564502716064e-05, "learning_rate": 4.893617021276595e-07, "loss": 0.0339, "reward": 0.5182291865348816, "reward_std": 0.2648882642388344, "rewards/accuracy_reward": 0.14583333674818277, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3723958432674408, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 944.8750152587891, "epoch": 0.0512, "grad_norm": 0.16476425166098713, "kl": 1.712888479232788e-05, "learning_rate": 5.106382978723403e-07, "loss": 0.0178, "reward": 0.5338541865348816, "reward_std": 0.2086249254643917, "rewards/accuracy_reward": 0.1770833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3567708432674408, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 912.0521087646484, "epoch": 0.05333333333333334, "grad_norm": 0.15949749914052028, "kl": 1.029670238494873e-05, "learning_rate": 5.319148936170212e-07, "loss": 0.0359, "reward": 0.5807291865348816, "reward_std": 0.23926915973424911, "rewards/accuracy_reward": 0.19791667442768812, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125149011612, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 948.7500152587891, "epoch": 0.055466666666666664, "grad_norm": 0.11874893433162777, "kl": 2.4452805519104004e-05, "learning_rate": 5.531914893617021e-07, "loss": 0.0124, "reward": 0.510416679084301, "reward_std": 0.18505185097455978, "rewards/accuracy_reward": 0.1770833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3333333358168602, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 802.9687805175781, "epoch": 0.0576, "grad_norm": 0.22329831650935772, "kl": 2.3111701011657715e-05, "learning_rate": 5.74468085106383e-07, "loss": 0.0836, "reward": 0.809895858168602, "reward_std": 0.4666432961821556, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625074505806, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 930.2187652587891, "epoch": 0.05973333333333333, "grad_norm": 0.13098037690492897, "kl": 2.3573637008666992e-05, "learning_rate": 5.957446808510638e-07, "loss": 0.0258, "reward": 0.6250000149011612, "reward_std": 0.2079470045864582, "rewards/accuracy_reward": 0.2395833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.385416679084301, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 919.5625305175781, "epoch": 0.06186666666666667, "grad_norm": 0.15705368765533165, "kl": 3.425776958465576e-05, "learning_rate": 6.170212765957446e-07, "loss": 0.0226, "reward": 0.5833333432674408, "reward_std": 0.3197858855128288, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3958333507180214, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 838.4271087646484, "epoch": 0.064, "grad_norm": 0.20855681863309491, "kl": 3.6150217056274414e-05, "learning_rate": 6.382978723404255e-07, "loss": 0.0557, "reward": 0.6380208507180214, "reward_std": 0.3100079074501991, "rewards/accuracy_reward": 0.21875000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4192708432674408, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 862.4271087646484, "epoch": 0.06613333333333334, "grad_norm": 0.19570403843889805, "kl": 6.431341171264648e-05, "learning_rate": 6.595744680851063e-07, "loss": 0.0654, "reward": 0.6666666865348816, "reward_std": 0.23266133293509483, "rewards/accuracy_reward": 0.23958334606140852, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4270833432674408, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 840.6041870117188, "epoch": 0.06826666666666667, "grad_norm": 0.1847871279034566, "kl": 6.045214831829071e-05, "learning_rate": 6.808510638297872e-07, "loss": 0.0659, "reward": 0.6015625223517418, "reward_std": 0.33919696137309074, "rewards/accuracy_reward": 0.19791667349636555, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4036458432674408, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 903.1250305175781, "epoch": 0.0704, "grad_norm": 0.1688618611852386, "kl": 5.7756900787353516e-05, "learning_rate": 7.021276595744681e-07, "loss": 0.0359, "reward": 0.6458333432674408, "reward_std": 0.3414671868085861, "rewards/accuracy_reward": 0.2395833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4062500074505806, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 827.0729217529297, "epoch": 0.07253333333333334, "grad_norm": 0.18227015395901028, "kl": 0.00010900944471359253, "learning_rate": 7.23404255319149e-07, "loss": 0.0124, "reward": 0.8307292014360428, "reward_std": 0.3232859745621681, "rewards/accuracy_reward": 0.4062500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4244791716337204, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 879.5729370117188, "epoch": 0.07466666666666667, "grad_norm": 0.18233962099030362, "kl": 0.00015303492546081543, "learning_rate": 7.446808510638297e-07, "loss": 0.0579, "reward": 0.6328125298023224, "reward_std": 0.20242082886397839, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.424479179084301, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 833.0000152587891, "epoch": 0.0768, "grad_norm": 0.21846869996465385, "kl": 0.00018644332885742188, "learning_rate": 7.659574468085106e-07, "loss": 0.058, "reward": 0.8697916865348816, "reward_std": 0.44036802649497986, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463541679084301, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 884.8750305175781, "epoch": 0.07893333333333333, "grad_norm": 0.18403339762716192, "kl": 0.00013756752014160156, "learning_rate": 7.872340425531915e-07, "loss": 0.0163, "reward": 0.5911458656191826, "reward_std": 0.30511896684765816, "rewards/accuracy_reward": 0.20833334233611822, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3828125149011612, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 837.1562805175781, "epoch": 0.08106666666666666, "grad_norm": 0.19659938761910423, "kl": 0.00023549795150756836, "learning_rate": 8.085106382978723e-07, "loss": 0.0552, "reward": 0.7369792014360428, "reward_std": 0.3835315965116024, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.424479179084301, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 864.0416870117188, "epoch": 0.0832, "grad_norm": 0.17026366658290032, "kl": 0.0002574920654296875, "learning_rate": 8.297872340425532e-07, "loss": 0.042, "reward": 0.658854179084301, "reward_std": 0.22663411498069763, "rewards/accuracy_reward": 0.2604166669771075, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375074505806, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 862.2083435058594, "epoch": 0.08533333333333333, "grad_norm": 0.15525233934854948, "kl": 0.00037288665771484375, "learning_rate": 8.51063829787234e-07, "loss": 0.0344, "reward": 0.7239583432674408, "reward_std": 0.21023957571014762, "rewards/accuracy_reward": 0.30208333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4218750149011612, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 890.6354370117188, "epoch": 0.08746666666666666, "grad_norm": 0.15378582910420188, "kl": 0.00021708011627197266, "learning_rate": 8.723404255319149e-07, "loss": 0.0307, "reward": 0.6093750149011612, "reward_std": 0.20325927063822746, "rewards/accuracy_reward": 0.21875000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250149011612, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 721.0104217529297, "epoch": 0.0896, "grad_norm": 0.25866548106788456, "kl": 0.0007886886596679688, "learning_rate": 8.936170212765957e-07, "loss": 0.0465, "reward": 0.9531250298023224, "reward_std": 0.4038851633667946, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5052083507180214, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 864.9687652587891, "epoch": 0.09173333333333333, "grad_norm": 0.1593135664938312, "kl": 0.0002856254577636719, "learning_rate": 9.148936170212766e-07, "loss": 0.0397, "reward": 0.7630208432674408, "reward_std": 0.24727017246186733, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4296875149011612, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 798.4791870117188, "epoch": 0.09386666666666667, "grad_norm": 0.3075627551417041, "kl": 0.0007753372192382812, "learning_rate": 9.361702127659575e-07, "loss": 0.0926, "reward": 0.8359375149011612, "reward_std": 0.4430364593863487, "rewards/accuracy_reward": 0.33333333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5026041865348816, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 857.8437652587891, "epoch": 0.096, "grad_norm": 0.16498643701657156, "kl": 0.0007944107055664062, "learning_rate": 9.574468085106384e-07, "loss": 0.0198, "reward": 0.6614583507180214, "reward_std": 0.2478529755026102, "rewards/accuracy_reward": 0.2708333423361182, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3906250074505806, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 841.1562652587891, "epoch": 0.09813333333333334, "grad_norm": 0.18056566439161278, "kl": 0.00116729736328125, "learning_rate": 9.78723404255319e-07, "loss": 0.0172, "reward": 0.5911458432674408, "reward_std": 0.24517233669757843, "rewards/accuracy_reward": 0.1770833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4140625149011612, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 886.1562652587891, "epoch": 0.10026666666666667, "grad_norm": 0.1970736501400628, "kl": 0.0014438629150390625, "learning_rate": 1e-06, "loss": 0.0498, "reward": 0.7135416716337204, "reward_std": 0.26464908197522163, "rewards/accuracy_reward": 0.3333333497866988, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3802083507180214, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 890.9375152587891, "epoch": 0.1024, "grad_norm": 0.1944077299260527, "kl": 0.00160980224609375, "learning_rate": 9.999874710101751e-07, "loss": 0.0525, "reward": 0.692708358168602, "reward_std": 0.35645777732133865, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.401041679084301, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 858.7604217529297, "epoch": 0.10453333333333334, "grad_norm": 0.20748565591002802, "kl": 0.0023660659790039062, "learning_rate": 9.999498847383701e-07, "loss": 0.0501, "reward": 0.8203125149011612, "reward_std": 0.38833674043416977, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4244791716337204, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 749.6041870117188, "epoch": 0.10666666666666667, "grad_norm": 0.225112305188479, "kl": 0.0024404525756835938, "learning_rate": 9.998872432775536e-07, "loss": 0.0557, "reward": 1.0703125447034836, "reward_std": 0.3621439263224602, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4661458358168602, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 845.7083740234375, "epoch": 0.1088, "grad_norm": 0.19286371886262563, "kl": 0.0018463134765625, "learning_rate": 9.99799550115878e-07, "loss": 0.0212, "reward": 0.7526041939854622, "reward_std": 0.3729289174079895, "rewards/accuracy_reward": 0.3541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375149011612, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 806.3854370117188, "epoch": 0.11093333333333333, "grad_norm": 0.23134579794812155, "kl": 0.0025386810302734375, "learning_rate": 9.99686810136484e-07, "loss": 0.0694, "reward": 0.817708358168602, "reward_std": 0.34440432488918304, "rewards/accuracy_reward": 0.3645833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250149011612, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 844.3437652587891, "epoch": 0.11306666666666666, "grad_norm": 0.17791438000124069, "kl": 0.00243377685546875, "learning_rate": 9.995490296172302e-07, "loss": 0.0348, "reward": 0.7968750298023224, "reward_std": 0.26974398642778397, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4010416865348816, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 910.5208435058594, "epoch": 0.1152, "grad_norm": 0.19770122402586285, "kl": 0.0022225379943847656, "learning_rate": 9.993862162303412e-07, "loss": 0.0511, "reward": 0.7473958432674408, "reward_std": 0.2851286958903074, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.393229179084301, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 890.2396087646484, "epoch": 0.11733333333333333, "grad_norm": 0.22500934978331336, "kl": 0.0024385452270507812, "learning_rate": 9.991983790419832e-07, "loss": 0.053, "reward": 0.653645858168602, "reward_std": 0.2874385491013527, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4036458432674408, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 872.7396087646484, "epoch": 0.11946666666666667, "grad_norm": 0.18421634404374504, "kl": 0.003376007080078125, "learning_rate": 9.989855285117573e-07, "loss": 0.0606, "reward": 0.7187500298023224, "reward_std": 0.271013580262661, "rewards/accuracy_reward": 0.3229166781529784, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3958333432674408, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 815.7083587646484, "epoch": 0.1216, "grad_norm": 0.22992129396312608, "kl": 0.003387451171875, "learning_rate": 9.98747676492117e-07, "loss": 0.0465, "reward": 0.8151042014360428, "reward_std": 0.38903436064720154, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.440104179084301, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 790.6875152587891, "epoch": 0.12373333333333333, "grad_norm": 0.18867270309064404, "kl": 0.00368499755859375, "learning_rate": 9.984848362277092e-07, "loss": 0.0351, "reward": 0.7994791865348816, "reward_std": 0.25493843853473663, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.424479179084301, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 867.0104370117188, "epoch": 0.12586666666666665, "grad_norm": 0.21208207281989394, "kl": 0.0047149658203125, "learning_rate": 9.981970223546364e-07, "loss": 0.0405, "reward": 0.8437500223517418, "reward_std": 0.4021975174546242, "rewards/accuracy_reward": 0.4270833544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.416666679084301, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 772.7291870117188, "epoch": 0.128, "grad_norm": 0.2211569911681807, "kl": 0.003696441650390625, "learning_rate": 9.97884250899641e-07, "loss": 0.0368, "reward": 1.0312500298023224, "reward_std": 0.21990002878010273, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4895833507180214, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 899.7708587646484, "epoch": 0.13013333333333332, "grad_norm": 0.11014958756420937, "kl": 0.00301361083984375, "learning_rate": 9.975465392792135e-07, "loss": 0.0015, "reward": 0.6744791865348816, "reward_std": 0.16699286550283432, "rewards/accuracy_reward": 0.322916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3515625074505806, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 913.7291870117188, "epoch": 0.13226666666666667, "grad_norm": 0.18919373882455442, "kl": 0.003414154052734375, "learning_rate": 9.971839062986228e-07, "loss": 0.0458, "reward": 0.708333358168602, "reward_std": 0.37071289867162704, "rewards/accuracy_reward": 0.3020833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4062500149011612, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 850.1354522705078, "epoch": 0.1344, "grad_norm": 0.22143875732066168, "kl": 0.00384521484375, "learning_rate": 9.967963721508683e-07, "loss": 0.0392, "reward": 0.7526041865348816, "reward_std": 0.27115987055003643, "rewards/accuracy_reward": 0.3333333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4192708432674408, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 963.1979370117188, "epoch": 0.13653333333333334, "grad_norm": 0.1550262557067119, "kl": 0.0036773681640625, "learning_rate": 9.963839584155564e-07, "loss": 0.0294, "reward": 0.5000000149011612, "reward_std": 0.238736130297184, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3541666716337204, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 723.5833435058594, "epoch": 0.13866666666666666, "grad_norm": 0.20889664359051643, "kl": 0.0074310302734375, "learning_rate": 9.95946688057698e-07, "loss": 0.0291, "reward": 0.9791666716337204, "reward_std": 0.2577291578054428, "rewards/accuracy_reward": 0.5312500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4479166716337204, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 785.0520935058594, "epoch": 0.1408, "grad_norm": 0.22320299690399084, "kl": 0.005214691162109375, "learning_rate": 9.954845854264304e-07, "loss": 0.0318, "reward": 0.8854167014360428, "reward_std": 0.38225920498371124, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4270833432674408, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 687.5521011352539, "epoch": 0.14293333333333333, "grad_norm": 0.26192399966053026, "kl": 0.00640869140625, "learning_rate": 9.949976762536612e-07, "loss": 0.0332, "reward": 1.0598958432674408, "reward_std": 0.38580355048179626, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4973958507180214, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 816.4583435058594, "epoch": 0.14506666666666668, "grad_norm": 0.2195932504857594, "kl": 0.006011962890625, "learning_rate": 9.944859876526347e-07, "loss": 0.0482, "reward": 0.8671875223517418, "reward_std": 0.42579207941889763, "rewards/accuracy_reward": 0.3958333386108279, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4713541865348816, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 763.9895935058594, "epoch": 0.1472, "grad_norm": 0.19845257741384642, "kl": 0.006496429443359375, "learning_rate": 9.939495481164237e-07, "loss": 0.0259, "reward": 1.1302083730697632, "reward_std": 0.3065594360232353, "rewards/accuracy_reward": 0.6250000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5052083432674408, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 714.0520935058594, "epoch": 0.14933333333333335, "grad_norm": 0.22169840225751747, "kl": 0.005207061767578125, "learning_rate": 9.933883875163411e-07, "loss": 0.055, "reward": 1.057291716337204, "reward_std": 0.2972635589540005, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4947916865348816, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 880.4791870117188, "epoch": 0.15146666666666667, "grad_norm": 0.23367859778525008, "kl": 0.00757598876953125, "learning_rate": 9.928025371002782e-07, "loss": 0.067, "reward": 0.7057291865348816, "reward_std": 0.34901827573776245, "rewards/accuracy_reward": 0.28125000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4244791716337204, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 873.2604370117188, "epoch": 0.1536, "grad_norm": 0.18132372403520364, "kl": 0.00620269775390625, "learning_rate": 9.921920294909627e-07, "loss": 0.0377, "reward": 0.7057291939854622, "reward_std": 0.35817644000053406, "rewards/accuracy_reward": 0.2916666753590107, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4140625149011612, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 738.1979370117188, "epoch": 0.15573333333333333, "grad_norm": 0.22228134275480527, "kl": 0.00677490234375, "learning_rate": 9.91556898684145e-07, "loss": 0.0601, "reward": 1.1302083879709244, "reward_std": 0.30970917269587517, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.505208358168602, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 821.6145935058594, "epoch": 0.15786666666666666, "grad_norm": 0.20389266404950881, "kl": 0.00748443603515625, "learning_rate": 9.90897180046702e-07, "loss": 0.0303, "reward": 0.934895858168602, "reward_std": 0.29574301838874817, "rewards/accuracy_reward": 0.4687500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.466145858168602, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 802.5000305175781, "epoch": 0.16, "grad_norm": 0.2024163287984723, "kl": 0.00734710693359375, "learning_rate": 9.902129103146697e-07, "loss": 0.0486, "reward": 0.8515625149011612, "reward_std": 0.2905624881386757, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4348958432674408, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 846.8646087646484, "epoch": 0.16213333333333332, "grad_norm": 0.2311059289379649, "kl": 0.00652313232421875, "learning_rate": 9.89504127591197e-07, "loss": 0.0571, "reward": 0.7942708507180214, "reward_std": 0.30027635395526886, "rewards/accuracy_reward": 0.3854166828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4088541716337204, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 878.6458587646484, "epoch": 0.16426666666666667, "grad_norm": 0.15628470133135228, "kl": 0.00507354736328125, "learning_rate": 9.887708713444242e-07, "loss": 0.0275, "reward": 0.799479179084301, "reward_std": 0.356827512383461, "rewards/accuracy_reward": 0.3958333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4036458432674408, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 754.3541870117188, "epoch": 0.1664, "grad_norm": 0.23480771387358682, "kl": 0.00949859619140625, "learning_rate": 9.880131824052848e-07, "loss": 0.0503, "reward": 1.0078125447034836, "reward_std": 0.3376114219427109, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4661458507180214, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 807.8646087646484, "epoch": 0.16853333333333334, "grad_norm": 0.2549074989608758, "kl": 0.0102691650390625, "learning_rate": 9.87231102965232e-07, "loss": 0.0761, "reward": 0.755208358168602, "reward_std": 0.3330737203359604, "rewards/accuracy_reward": 0.3020833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250074505806, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 842.8229370117188, "epoch": 0.17066666666666666, "grad_norm": 0.16239962220325158, "kl": 0.00743865966796875, "learning_rate": 9.8642467657389e-07, "loss": 0.0406, "reward": 0.7812500074505806, "reward_std": 0.24844172969460487, "rewards/accuracy_reward": 0.35416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4270833432674408, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 831.6562652587891, "epoch": 0.1728, "grad_norm": 0.19487106047627312, "kl": 0.00650787353515625, "learning_rate": 9.855939481366275e-07, "loss": 0.0389, "reward": 0.7890625149011612, "reward_std": 0.27398327738046646, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4348958358168602, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 764.9062652587891, "epoch": 0.17493333333333333, "grad_norm": 0.24325209901582961, "kl": 0.008331298828125, "learning_rate": 9.847389639120585e-07, "loss": 0.0106, "reward": 1.0130208432674408, "reward_std": 0.4021530821919441, "rewards/accuracy_reward": 0.4895833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375149011612, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 819.7604522705078, "epoch": 0.17706666666666668, "grad_norm": 0.2209085097836123, "kl": 0.0158843994140625, "learning_rate": 9.83859771509466e-07, "loss": 0.0379, "reward": 0.7812500149011612, "reward_std": 0.33020088635385036, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4270833507180214, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 892.4687805175781, "epoch": 0.1792, "grad_norm": 0.1908083102962245, "kl": 0.00670623779296875, "learning_rate": 9.829564198861508e-07, "loss": 0.0521, "reward": 0.7994792014360428, "reward_std": 0.3195120617747307, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.424479179084301, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 867.65625, "epoch": 0.18133333333333335, "grad_norm": 0.19210591117215867, "kl": 0.00887298583984375, "learning_rate": 9.820289593447051e-07, "loss": 0.0358, "reward": 0.8593750149011612, "reward_std": 0.313580647110939, "rewards/accuracy_reward": 0.4062500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250149011612, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 825.9375152587891, "epoch": 0.18346666666666667, "grad_norm": 0.19047008974679128, "kl": 0.008056640625, "learning_rate": 9.810774415302124e-07, "loss": 0.0366, "reward": 0.7890625298023224, "reward_std": 0.30688632279634476, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4348958432674408, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 790.0312652587891, "epoch": 0.1856, "grad_norm": 0.2076366117901631, "kl": 0.0073089599609375, "learning_rate": 9.8010191942737e-07, "loss": 0.0695, "reward": 0.9557292014360428, "reward_std": 0.3128313571214676, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625074505806, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 839.0416717529297, "epoch": 0.18773333333333334, "grad_norm": 0.18856368038348906, "kl": 0.00917816162109375, "learning_rate": 9.791024473575404e-07, "loss": 0.058, "reward": 0.7760417014360428, "reward_std": 0.3489021761342883, "rewards/accuracy_reward": 0.3437500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.432291679084301, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 792.1562652587891, "epoch": 0.18986666666666666, "grad_norm": 0.2399608562140009, "kl": 0.00940704345703125, "learning_rate": 9.780790809757253e-07, "loss": 0.078, "reward": 1.046875, "reward_std": 0.3642818257212639, "rewards/accuracy_reward": 0.5520833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.494791679084301, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 688.6875152587891, "epoch": 0.192, "grad_norm": 0.2577576037099253, "kl": 0.0142364501953125, "learning_rate": 9.770318772674668e-07, "loss": 0.0516, "reward": 1.1718750447034836, "reward_std": 0.2594817951321602, "rewards/accuracy_reward": 0.6458333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.526041679084301, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 844.0416870117188, "epoch": 0.19413333333333332, "grad_norm": 0.2610335490987318, "kl": 0.0152130126953125, "learning_rate": 9.759608945456744e-07, "loss": 0.0783, "reward": 0.8567708432674408, "reward_std": 0.3492202013731003, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375223517418, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 721.5416870117188, "epoch": 0.19626666666666667, "grad_norm": 0.30445853959364694, "kl": 0.0128326416015625, "learning_rate": 9.748661924473775e-07, "loss": 0.0666, "reward": 1.0078125447034836, "reward_std": 0.365444540977478, "rewards/accuracy_reward": 0.4687500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625149011612, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 760.5521087646484, "epoch": 0.1984, "grad_norm": 0.26398567569252307, "kl": 0.0128021240234375, "learning_rate": 9.737478319304048e-07, "loss": 0.0602, "reward": 1.0963541865348816, "reward_std": 0.28444724529981613, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.533854179084301, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 745.0729217529297, "epoch": 0.20053333333333334, "grad_norm": 0.31783151457154274, "kl": 0.0146636962890625, "learning_rate": 9.726058752699897e-07, "loss": 0.0404, "reward": 1.0390625298023224, "reward_std": 0.3004063665866852, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625298023224, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 782.3646087646484, "epoch": 0.20266666666666666, "grad_norm": 0.262805669568343, "kl": 0.0127410888671875, "learning_rate": 9.714403860553027e-07, "loss": 0.0752, "reward": 0.9635417014360428, "reward_std": 0.34173475950956345, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5260416865348816, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 635.7708587646484, "epoch": 0.2048, "grad_norm": 0.2986462792289919, "kl": 0.0128631591796875, "learning_rate": 9.702514291859108e-07, "loss": 0.0892, "reward": 1.3906250596046448, "reward_std": 0.3887404687702656, "rewards/accuracy_reward": 0.7187500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750149011612, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 742.9791870117188, "epoch": 0.20693333333333333, "grad_norm": 0.23266958044606426, "kl": 0.01507568359375, "learning_rate": 9.690390708681624e-07, "loss": 0.0609, "reward": 1.1848958432674408, "reward_std": 0.3821899965405464, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625149011612, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 807.4375152587891, "epoch": 0.20906666666666668, "grad_norm": 0.28427091318075753, "kl": 0.0138397216796875, "learning_rate": 9.678033786115028e-07, "loss": 0.0799, "reward": 0.9296875298023224, "reward_std": 0.39811836928129196, "rewards/accuracy_reward": 0.4270833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.502604179084301, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 690.6250152587891, "epoch": 0.2112, "grad_norm": 0.33385067368025745, "kl": 0.0172576904296875, "learning_rate": 9.665444212247126e-07, "loss": 0.1125, "reward": 1.174479216337204, "reward_std": 0.4053206965327263, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125149011612, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 703.5104370117188, "epoch": 0.21333333333333335, "grad_norm": 0.26763878695230725, "kl": 0.0191497802734375, "learning_rate": 9.652622688120774e-07, "loss": 0.0934, "reward": 1.2343750596046448, "reward_std": 0.4297167584300041, "rewards/accuracy_reward": 0.5937500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6406250149011612, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 729.8542022705078, "epoch": 0.21546666666666667, "grad_norm": 0.2604446935793223, "kl": 0.01666259765625, "learning_rate": 9.639569927694842e-07, "loss": 0.0484, "reward": 1.244791716337204, "reward_std": 0.2556636780500412, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5989583507180214, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 715.8229217529297, "epoch": 0.2176, "grad_norm": 0.29543708158686033, "kl": 0.020050048828125, "learning_rate": 9.626286657804454e-07, "loss": 0.0634, "reward": 1.127604216337204, "reward_std": 0.31952518224716187, "rewards/accuracy_reward": 0.5312500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5963541865348816, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 719.2500305175781, "epoch": 0.21973333333333334, "grad_norm": 0.2824191072667187, "kl": 0.020294189453125, "learning_rate": 9.612773618120509e-07, "loss": 0.0619, "reward": 1.1067708432674408, "reward_std": 0.4568670988082886, "rewards/accuracy_reward": 0.5312500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.575520858168602, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 800.2395935058594, "epoch": 0.22186666666666666, "grad_norm": 0.29815196332100213, "kl": 0.015228271484375, "learning_rate": 9.599031561108505e-07, "loss": 0.0637, "reward": 0.8593750223517418, "reward_std": 0.3243005946278572, "rewards/accuracy_reward": 0.3958333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463541679084301, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 683.0625305175781, "epoch": 0.224, "grad_norm": 0.29748629948846467, "kl": 0.01812744140625, "learning_rate": 9.585061251986632e-07, "loss": 0.062, "reward": 1.263020858168602, "reward_std": 0.35825271904468536, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6588541865348816, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 728.7708587646484, "epoch": 0.22613333333333333, "grad_norm": 0.3282857287460221, "kl": 0.020751953125, "learning_rate": 9.57086346868316e-07, "loss": 0.1268, "reward": 1.1276041865348816, "reward_std": 0.4390442371368408, "rewards/accuracy_reward": 0.5520833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5755208432674408, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 777.3021087646484, "epoch": 0.22826666666666667, "grad_norm": 0.31736765287866603, "kl": 0.021881103515625, "learning_rate": 9.556439001793124e-07, "loss": 0.0837, "reward": 0.9401041865348816, "reward_std": 0.4035160690546036, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5338541939854622, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 826.1354370117188, "epoch": 0.2304, "grad_norm": 0.30615838328853484, "kl": 0.022369384765625, "learning_rate": 9.541788654534294e-07, "loss": 0.0902, "reward": 0.9921875447034836, "reward_std": 0.44786881655454636, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875149011612, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 706.3541870117188, "epoch": 0.23253333333333334, "grad_norm": 0.3044969299318315, "kl": 0.0216064453125, "learning_rate": 9.526913242702458e-07, "loss": 0.0482, "reward": 1.0520833432674408, "reward_std": 0.3498193696141243, "rewards/accuracy_reward": 0.42708333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6250000149011612, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 789.9270935058594, "epoch": 0.23466666666666666, "grad_norm": 0.31077022412663985, "kl": 0.0228118896484375, "learning_rate": 9.511813594625986e-07, "loss": 0.0936, "reward": 1.0364583730697632, "reward_std": 0.45415426790714264, "rewards/accuracy_reward": 0.4895833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5468750149011612, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 782.9270935058594, "epoch": 0.2368, "grad_norm": 0.33311202322553307, "kl": 0.03173828125, "learning_rate": 9.496490551119708e-07, "loss": 0.0456, "reward": 1.0807292014360428, "reward_std": 0.43023916706442833, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5598958358168602, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 797.3958587646484, "epoch": 0.23893333333333333, "grad_norm": 0.27289293925345737, "kl": 0.0220947265625, "learning_rate": 9.480944965438097e-07, "loss": 0.0822, "reward": 0.8541666865348816, "reward_std": 0.2803124524652958, "rewards/accuracy_reward": 0.3541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5000000223517418, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 728.5416870117188, "epoch": 0.24106666666666668, "grad_norm": 0.3606704422913038, "kl": 0.029632568359375, "learning_rate": 9.465177703227755e-07, "loss": 0.0869, "reward": 1.0677083730697632, "reward_std": 0.37738659232854843, "rewards/accuracy_reward": 0.510416679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5572916865348816, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 652.7812805175781, "epoch": 0.2432, "grad_norm": 0.3584790769205954, "kl": 0.025115966796875, "learning_rate": 9.449189642479202e-07, "loss": 0.1074, "reward": 1.213541716337204, "reward_std": 0.4059242531657219, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5885416865348816, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 725.1146087646484, "epoch": 0.24533333333333332, "grad_norm": 0.4024464636552969, "kl": 0.023956298828125, "learning_rate": 9.432981673477996e-07, "loss": 0.1087, "reward": 0.911458358168602, "reward_std": 0.3445095419883728, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250149011612, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 698.2292022705078, "epoch": 0.24746666666666667, "grad_norm": 0.2829082400468053, "kl": 0.023193359375, "learning_rate": 9.416554698755153e-07, "loss": 0.0251, "reward": 1.151041716337204, "reward_std": 0.3484726771712303, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6510416716337204, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 695.8958587646484, "epoch": 0.2496, "grad_norm": 0.4657128990625891, "kl": 0.03369140625, "learning_rate": 9.399909633036895e-07, "loss": 0.1124, "reward": 1.1406250298023224, "reward_std": 0.39415134489536285, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250223517418, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 601.6979370117188, "epoch": 0.2517333333333333, "grad_norm": 0.4438313112348482, "kl": 0.025787353515625, "learning_rate": 9.383047403193702e-07, "loss": 0.0863, "reward": 1.3020833730697632, "reward_std": 0.2631511315703392, "rewards/accuracy_reward": 0.635416679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6666666865348816, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 712.4062652587891, "epoch": 0.2538666666666667, "grad_norm": 0.3343335198346674, "kl": 0.031707763671875, "learning_rate": 9.365968948188716e-07, "loss": 0.0373, "reward": 1.1718750447034836, "reward_std": 0.3704974502325058, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6510416865348816, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 807.2187805175781, "epoch": 0.256, "grad_norm": 0.44264007327871097, "kl": 0.03948974609375, "learning_rate": 9.348675219025442e-07, "loss": 0.065, "reward": 0.8593750149011612, "reward_std": 0.3359655812382698, "rewards/accuracy_reward": 0.34375002048909664, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5156250149011612, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 585.8541793823242, "epoch": 0.2581333333333333, "grad_norm": 0.44710975229663324, "kl": 0.041748046875, "learning_rate": 9.331167178694797e-07, "loss": 0.0672, "reward": 1.3854166865348816, "reward_std": 0.3670189455151558, "rewards/accuracy_reward": 0.645833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.739583358168602, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 586.9270935058594, "epoch": 0.26026666666666665, "grad_norm": 0.5132333892556646, "kl": 0.03607177734375, "learning_rate": 9.313445802121493e-07, "loss": 0.0729, "reward": 1.2630209028720856, "reward_std": 0.3291833885014057, "rewards/accuracy_reward": 0.5937500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6692708432674408, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 639.3125152587891, "epoch": 0.2624, "grad_norm": 0.5632492443313848, "kl": 0.0504150390625, "learning_rate": 9.295512076109733e-07, "loss": 0.1126, "reward": 1.2135416865348816, "reward_std": 0.40659692883491516, "rewards/accuracy_reward": 0.5625000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6510416716337204, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 708.8229370117188, "epoch": 0.26453333333333334, "grad_norm": 0.7405631059644997, "kl": 0.051025390625, "learning_rate": 9.277366999288277e-07, "loss": 0.187, "reward": 1.0755208879709244, "reward_std": 0.4825942814350128, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6588541865348816, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 683.0520935058594, "epoch": 0.26666666666666666, "grad_norm": 0.9799980392473033, "kl": 0.05389404296875, "learning_rate": 9.259011582054829e-07, "loss": 0.1302, "reward": 0.942708358168602, "reward_std": 0.4361359179019928, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750149011612, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 700.6875152587891, "epoch": 0.2688, "grad_norm": 0.6649953700875584, "kl": 0.0589599609375, "learning_rate": 9.240446846519767e-07, "loss": 0.0771, "reward": 1.2239583730697632, "reward_std": 0.3370389975607395, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822916865348816, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 631.2708587646484, "epoch": 0.27093333333333336, "grad_norm": 0.7556931317033608, "kl": 0.05859375, "learning_rate": 9.221673826449239e-07, "loss": 0.1097, "reward": 1.3515625298023224, "reward_std": 0.3902127370238304, "rewards/accuracy_reward": 0.6875000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 710.3333587646484, "epoch": 0.2730666666666667, "grad_norm": 0.6862132500429674, "kl": 0.07720947265625, "learning_rate": 9.202693567207587e-07, "loss": 0.0315, "reward": 1.1562500298023224, "reward_std": 0.4864039123058319, "rewards/accuracy_reward": 0.5104166939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6458333432674408, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 697.7604370117188, "epoch": 0.2752, "grad_norm": 1.1345184536394581, "kl": 0.0782470703125, "learning_rate": 9.183507125699143e-07, "loss": 0.0796, "reward": 1.1901041865348816, "reward_std": 0.46896427869796753, "rewards/accuracy_reward": 0.5416666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375149011612, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 780.9896087646484, "epoch": 0.2773333333333333, "grad_norm": 1.2758278535685441, "kl": 0.10870361328125, "learning_rate": 9.164115570309379e-07, "loss": 0.1151, "reward": 1.0130208432674408, "reward_std": 0.46661972999572754, "rewards/accuracy_reward": 0.4479166818782687, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.565104179084301, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 687.6771087646484, "epoch": 0.27946666666666664, "grad_norm": 1.0505806863206573, "kl": 0.11328125, "learning_rate": 9.144519980845404e-07, "loss": 0.0867, "reward": 1.1718750298023224, "reward_std": 0.37612347677350044, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6510416865348816, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 797.9479370117188, "epoch": 0.2816, "grad_norm": 1.2170240378062362, "kl": 0.14501953125, "learning_rate": 9.124721448475846e-07, "loss": 0.0487, "reward": 0.8776041865348816, "reward_std": 0.3442392908036709, "rewards/accuracy_reward": 0.3333333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.544270858168602, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 749.1146087646484, "epoch": 0.28373333333333334, "grad_norm": 1.7369718821049942, "kl": 0.1512451171875, "learning_rate": 9.104721075670086e-07, "loss": 0.0821, "reward": 1.0468750298023224, "reward_std": 0.43739401549100876, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750298023224, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 691.3437805175781, "epoch": 0.28586666666666666, "grad_norm": 1.2882430691135234, "kl": 0.1466064453125, "learning_rate": 9.084519976136866e-07, "loss": 0.0787, "reward": 1.1380208730697632, "reward_std": 0.37952160835266113, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6380208432674408, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 734.0833435058594, "epoch": 0.288, "grad_norm": 0.851150504615743, "kl": 0.19677734375, "learning_rate": 9.064119274762277e-07, "loss": 0.0263, "reward": 1.1067708730697632, "reward_std": 0.40093619376420975, "rewards/accuracy_reward": 0.5000000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6067708432674408, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 571.4166793823242, "epoch": 0.29013333333333335, "grad_norm": 1.5391538331108987, "kl": 0.22900390625, "learning_rate": 9.043520107547121e-07, "loss": 0.0605, "reward": 1.3593750298023224, "reward_std": 0.4824457913637161, "rewards/accuracy_reward": 0.6770833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822916865348816, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 760.6146087646484, "epoch": 0.2922666666666667, "grad_norm": 1.8702095195613229, "kl": 0.34130859375, "learning_rate": 9.022723621543649e-07, "loss": 0.0721, "reward": 1.2109375298023224, "reward_std": 0.5021077692508698, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375149011612, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 572.3646087646484, "epoch": 0.2944, "grad_norm": 2.0067460857326798, "kl": 0.27685546875, "learning_rate": 9.001730974791688e-07, "loss": 0.0303, "reward": 1.307291716337204, "reward_std": 0.37274327129125595, "rewards/accuracy_reward": 0.5937500298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7135416865348816, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 742.8854522705078, "epoch": 0.2965333333333333, "grad_norm": 2.727496225681536, "kl": 0.50390625, "learning_rate": 8.980543336254161e-07, "loss": 0.0822, "reward": 1.0390625298023224, "reward_std": 0.39713528752326965, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6223958432674408, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 694.0416870117188, "epoch": 0.2986666666666667, "grad_norm": 1.6554617304448012, "kl": 0.64404296875, "learning_rate": 8.95916188575199e-07, "loss": 0.1016, "reward": 1.0364583432674408, "reward_std": 0.39632973819971085, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5885417014360428, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 703.0520935058594, "epoch": 0.3008, "grad_norm": 2.3471939112303204, "kl": 0.3349609375, "learning_rate": 8.937587813898401e-07, "loss": 0.0249, "reward": 1.1250000447034836, "reward_std": 0.4193038195371628, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6041666865348816, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 614.1458435058594, "epoch": 0.30293333333333333, "grad_norm": 3.554057508710156, "kl": 0.64013671875, "learning_rate": 8.915822322032628e-07, "loss": 0.0336, "reward": 1.2005208879709244, "reward_std": 0.4515683874487877, "rewards/accuracy_reward": 0.5312500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.669270858168602, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 626.9166870117188, "epoch": 0.30506666666666665, "grad_norm": 2.348762447932129, "kl": 0.55615234375, "learning_rate": 8.893866622153005e-07, "loss": 0.0428, "reward": 1.1328125298023224, "reward_std": 0.4083319380879402, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.653645858168602, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 702.0521087646484, "epoch": 0.3072, "grad_norm": 5.644818619662011, "kl": 0.445556640625, "learning_rate": 8.871721936849489e-07, "loss": 0.0297, "reward": 1.0390625447034836, "reward_std": 0.4139084219932556, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625149011612, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 586.5416793823242, "epoch": 0.30933333333333335, "grad_norm": 8.550257693587795, "kl": 0.481689453125, "learning_rate": 8.849389499235579e-07, "loss": 0.0437, "reward": 1.3359375298023224, "reward_std": 0.4612661302089691, "rewards/accuracy_reward": 0.6354166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7005208730697632, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 597.4270935058594, "epoch": 0.31146666666666667, "grad_norm": 3.0532728962697826, "kl": 0.61328125, "learning_rate": 8.826870552879645e-07, "loss": -0.0264, "reward": 1.1015625447034836, "reward_std": 0.5034219920635223, "rewards/accuracy_reward": 0.4479166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6536458432674408, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 655.8437652587891, "epoch": 0.3136, "grad_norm": 3.124971384714811, "kl": 0.408935546875, "learning_rate": 8.804166351735689e-07, "loss": -0.0965, "reward": 1.0911458879709244, "reward_std": 0.4024947062134743, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6328125149011612, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 598.2812728881836, "epoch": 0.3157333333333333, "grad_norm": 1.6524780740950624, "kl": 0.401611328125, "learning_rate": 8.781278160073508e-07, "loss": 0.0663, "reward": 1.0781250596046448, "reward_std": 0.3967272564768791, "rewards/accuracy_reward": 0.4062500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750298023224, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 531.8958435058594, "epoch": 0.3178666666666667, "grad_norm": 2.309440251724059, "kl": 0.71435546875, "learning_rate": 8.758207252408305e-07, "loss": -0.0223, "reward": 1.2942708730697632, "reward_std": 0.46261265128850937, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7317708432674408, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 553.5312652587891, "epoch": 0.32, "grad_norm": 2.4367930100465056, "kl": 0.688232421875, "learning_rate": 8.734954913429713e-07, "loss": 0.1082, "reward": 1.2500000298023224, "reward_std": 0.41490884870290756, "rewards/accuracy_reward": 0.5937500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.65625, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 648.5104217529297, "epoch": 0.3221333333333333, "grad_norm": 2.8900176756676266, "kl": 0.9423828125, "learning_rate": 8.71152243793026e-07, "loss": 0.0808, "reward": 1.1380208730697632, "reward_std": 0.48239949345588684, "rewards/accuracy_reward": 0.4270833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375298023224, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 559.2396011352539, "epoch": 0.32426666666666665, "grad_norm": 2.9946090703577526, "kl": 0.69921875, "learning_rate": 8.687911130733266e-07, "loss": 0.0651, "reward": 1.377604216337204, "reward_std": 0.5387382358312607, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.731770858168602, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 600.0625305175781, "epoch": 0.3264, "grad_norm": 4.34193749706627, "kl": 1.0322265625, "learning_rate": 8.664122306620184e-07, "loss": 0.1012, "reward": 1.229166716337204, "reward_std": 0.4584341421723366, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000149011612, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 699.4583587646484, "epoch": 0.32853333333333334, "grad_norm": 2.4506518251356577, "kl": 0.83935546875, "learning_rate": 8.640157290257396e-07, "loss": 0.1104, "reward": 1.0208333730697632, "reward_std": 0.46958719938993454, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6666666865348816, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 566.9271087646484, "epoch": 0.33066666666666666, "grad_norm": 5.401034584726453, "kl": 1.1689453125, "learning_rate": 8.61601741612244e-07, "loss": 0.2134, "reward": 1.2890625596046448, "reward_std": 0.44453180953860283, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625149011612, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 633.0104217529297, "epoch": 0.3328, "grad_norm": 2.927726674982878, "kl": 0.76513671875, "learning_rate": 8.591704028429703e-07, "loss": 0.0545, "reward": 1.1432291865348816, "reward_std": 0.4252583682537079, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7057291865348816, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 515.8021011352539, "epoch": 0.33493333333333336, "grad_norm": 7.314880509347044, "kl": 2.34375, "learning_rate": 8.567218481055575e-07, "loss": 0.2219, "reward": 1.2838541865348816, "reward_std": 0.5363652482628822, "rewards/accuracy_reward": 0.604166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6796875298023224, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 593.1250152587891, "epoch": 0.3370666666666667, "grad_norm": 6.2789742564910505, "kl": 1.208984375, "learning_rate": 8.542562137463047e-07, "loss": 0.0515, "reward": 1.1979166865348816, "reward_std": 0.36978180706501007, "rewards/accuracy_reward": 0.4895833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7083333432674408, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 582.3854217529297, "epoch": 0.3392, "grad_norm": 5.297033801278587, "kl": 1.0830078125, "learning_rate": 8.517736370625802e-07, "loss": 0.1485, "reward": 1.3645833730697632, "reward_std": 0.4559536874294281, "rewards/accuracy_reward": 0.6979166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6666666865348816, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 647.9687652587891, "epoch": 0.3413333333333333, "grad_norm": 2.9904403312603587, "kl": 1.26806640625, "learning_rate": 8.492742562951751e-07, "loss": 0.1099, "reward": 1.096354216337204, "reward_std": 0.34164173156023026, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7005208432674408, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 631.9375305175781, "epoch": 0.34346666666666664, "grad_norm": 5.522771711959092, "kl": 1.095703125, "learning_rate": 8.467582106206057e-07, "loss": 0.1403, "reward": 1.1276042014360428, "reward_std": 0.4164763018488884, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.010416666977107525, "rewards/tag_count_reward": 0.6796875149011612, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 692.1562652587891, "epoch": 0.3456, "grad_norm": 6.080573070345314, "kl": 1.0849609375, "learning_rate": 8.44225640143364e-07, "loss": 0.0824, "reward": 1.0390625298023224, "reward_std": 0.4599665552377701, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625298023224, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 720.3333587646484, "epoch": 0.34773333333333334, "grad_norm": 2.824643415041674, "kl": 0.73681640625, "learning_rate": 8.416766858881155e-07, "loss": 0.043, "reward": 1.0598958730697632, "reward_std": 0.4137251526117325, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.684895858168602, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 589.7500305175781, "epoch": 0.34986666666666666, "grad_norm": 3.1784612712111007, "kl": 0.59130859375, "learning_rate": 8.391114897918462e-07, "loss": 0.0992, "reward": 1.3046875596046448, "reward_std": 0.3715285286307335, "rewards/accuracy_reward": 0.5937500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375149011612, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 737.8229370117188, "epoch": 0.352, "grad_norm": 3.0199017235674552, "kl": 1.55859375, "learning_rate": 8.3653019469596e-07, "loss": 0.1017, "reward": 1.0104166716337204, "reward_std": 0.36515790969133377, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6041666865348816, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 761.9166870117188, "epoch": 0.35413333333333336, "grad_norm": 2.6556502283004013, "kl": 0.80078125, "learning_rate": 8.339329443383233e-07, "loss": 0.0712, "reward": 1.013020858168602, "reward_std": 0.407824095338583, "rewards/accuracy_reward": 0.3645833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375298023224, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 632.3646087646484, "epoch": 0.3562666666666667, "grad_norm": 1.4394054723868772, "kl": 0.443359375, "learning_rate": 8.313198833452622e-07, "loss": 0.0856, "reward": 1.0781250298023224, "reward_std": 0.36224906146526337, "rewards/accuracy_reward": 0.35416668467223644, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7239583432674408, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 596.3854446411133, "epoch": 0.3584, "grad_norm": 3.231762913084113, "kl": 1.1796875, "learning_rate": 8.286911572235079e-07, "loss": 0.1796, "reward": 1.424479216337204, "reward_std": 0.4078570008277893, "rewards/accuracy_reward": 0.6770833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.747395858168602, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 590.2083435058594, "epoch": 0.3605333333333333, "grad_norm": 10.88252152738768, "kl": 2.607421875, "learning_rate": 8.260469123520953e-07, "loss": 0.2273, "reward": 1.3489584028720856, "reward_std": 0.46935708820819855, "rewards/accuracy_reward": 0.614583358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750298023224, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 612.3125152587891, "epoch": 0.3626666666666667, "grad_norm": 6.442990175492329, "kl": 2.0, "learning_rate": 8.233872959742116e-07, "loss": 0.2326, "reward": 1.1302083730697632, "reward_std": 0.4670635610818863, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750149011612, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 623.0937652587891, "epoch": 0.3648, "grad_norm": 1.8353876571256793, "kl": 0.764892578125, "learning_rate": 8.207124561889967e-07, "loss": 0.0401, "reward": 1.2161458730697632, "reward_std": 0.39249349758028984, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6119791865348816, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 589.8021087646484, "epoch": 0.36693333333333333, "grad_norm": 28.70359041511556, "kl": 4.802001953125, "learning_rate": 8.180225419432973e-07, "loss": 0.4525, "reward": 1.3203125, "reward_std": 0.4753311946988106, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7161458432674408, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 659.8125305175781, "epoch": 0.36906666666666665, "grad_norm": 2.7371530957315326, "kl": 1.0400390625, "learning_rate": 8.15317703023372e-07, "loss": -0.0115, "reward": 1.1015625298023224, "reward_std": 0.4031752720475197, "rewards/accuracy_reward": 0.3854166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716145858168602, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 594.1042022705078, "epoch": 0.3712, "grad_norm": 3.4265160026521544, "kl": 1.716796875, "learning_rate": 8.125980900465511e-07, "loss": 0.1577, "reward": 1.2083333730697632, "reward_std": 0.3657406345009804, "rewards/accuracy_reward": 0.5312500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6770833432674408, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 611.8229370117188, "epoch": 0.37333333333333335, "grad_norm": 5.3357300756416555, "kl": 0.9595947265625, "learning_rate": 8.098638544528493e-07, "loss": 0.0992, "reward": 1.3906250298023224, "reward_std": 0.4218136966228485, "rewards/accuracy_reward": 0.6250000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7656250298023224, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 607.4791793823242, "epoch": 0.37546666666666667, "grad_norm": 3.058337503705944, "kl": 1.177734375, "learning_rate": 8.071151484965328e-07, "loss": 0.1292, "reward": 1.1067708730697632, "reward_std": 0.4495965465903282, "rewards/accuracy_reward": 0.3958333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375298023224, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 651.0104370117188, "epoch": 0.3776, "grad_norm": 2.7653663091043246, "kl": 0.4844970703125, "learning_rate": 8.043521252376417e-07, "loss": 0.0987, "reward": 1.315104216337204, "reward_std": 0.3565051704645157, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7526041865348816, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 625.5625152587891, "epoch": 0.3797333333333333, "grad_norm": 1.4254748050396235, "kl": 0.68603515625, "learning_rate": 8.015749385334661e-07, "loss": 0.0841, "reward": 1.2630208730697632, "reward_std": 0.3636063262820244, "rewards/accuracy_reward": 0.5312500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7317708432674408, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 632.1979370117188, "epoch": 0.3818666666666667, "grad_norm": 4.394185818927082, "kl": 0.69482421875, "learning_rate": 7.987837430299792e-07, "loss": 0.1899, "reward": 1.2942708730697632, "reward_std": 0.401991605758667, "rewards/accuracy_reward": 0.5520833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875149011612, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 569.0208587646484, "epoch": 0.384, "grad_norm": 1.6852257830359076, "kl": 0.3607177734375, "learning_rate": 7.959786941532256e-07, "loss": -0.0204, "reward": 1.2187500298023224, "reward_std": 0.33338820189237595, "rewards/accuracy_reward": 0.4895833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7291667014360428, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 675.3854522705078, "epoch": 0.38613333333333333, "grad_norm": 3.9561409906104403, "kl": 0.663818359375, "learning_rate": 7.931599481006668e-07, "loss": 0.0859, "reward": 1.143229216337204, "reward_std": 0.44593609124422073, "rewards/accuracy_reward": 0.4270833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716145858168602, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 580.1562805175781, "epoch": 0.38826666666666665, "grad_norm": 2.4800102088439426, "kl": 0.46826171875, "learning_rate": 7.903276618324832e-07, "loss": 0.0902, "reward": 1.2265625149011612, "reward_std": 0.4622042179107666, "rewards/accuracy_reward": 0.5000000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625149011612, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 571.9271011352539, "epoch": 0.3904, "grad_norm": 2.251311300102917, "kl": 0.4532470703125, "learning_rate": 7.874819930628346e-07, "loss": 0.0658, "reward": 1.221354216337204, "reward_std": 0.45174503326416016, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7213541865348816, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 623.9479370117188, "epoch": 0.39253333333333335, "grad_norm": 2.48906947440004, "kl": 0.71728515625, "learning_rate": 7.846231002510761e-07, "loss": 0.0067, "reward": 1.2473958730697632, "reward_std": 0.39163821935653687, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625149011612, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 665.7500152587891, "epoch": 0.39466666666666667, "grad_norm": 2.233069393435031, "kl": 0.56005859375, "learning_rate": 7.817511425929367e-07, "loss": 0.0517, "reward": 1.0703125149011612, "reward_std": 0.41522736102342606, "rewards/accuracy_reward": 0.3541666753590107, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7161458432674408, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 636.0312652587891, "epoch": 0.3968, "grad_norm": 3.3030455877601606, "kl": 0.84326171875, "learning_rate": 7.788662800116533e-07, "loss": 0.0796, "reward": 1.2890625149011612, "reward_std": 0.42196864262223244, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7473958432674408, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 678.4271087646484, "epoch": 0.3989333333333333, "grad_norm": 2.986964953491577, "kl": 0.4951171875, "learning_rate": 7.759686731490654e-07, "loss": 0.1355, "reward": 1.1796875596046448, "reward_std": 0.3517295569181442, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.731770858168602, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 697.5521087646484, "epoch": 0.4010666666666667, "grad_norm": 4.877935102719587, "kl": 1.8828125, "learning_rate": 7.730584833566703e-07, "loss": 0.2451, "reward": 0.9791666865348816, "reward_std": 0.4034550338983536, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.645833358168602, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 546.7500305175781, "epoch": 0.4032, "grad_norm": 3.1944699506008334, "kl": 1.01123046875, "learning_rate": 7.701358726866384e-07, "loss": 0.0536, "reward": 1.2395833730697632, "reward_std": 0.333698995411396, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500298023224, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 704.6875152587891, "epoch": 0.4053333333333333, "grad_norm": 3.923143130594266, "kl": 1.359375, "learning_rate": 7.672010038827887e-07, "loss": 0.1197, "reward": 1.036458358168602, "reward_std": 0.3906657174229622, "rewards/accuracy_reward": 0.3541666744276881, "rewards/format_reward": 0.010416666977107525, "rewards/tag_count_reward": 0.6718750149011612, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 586.5833587646484, "epoch": 0.40746666666666664, "grad_norm": 3.400512631439386, "kl": 1.1552734375, "learning_rate": 7.642540403715278e-07, "loss": 0.1642, "reward": 1.3359375596046448, "reward_std": 0.4361158236861229, "rewards/accuracy_reward": 0.6250000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375149011612, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 608.7812805175781, "epoch": 0.4096, "grad_norm": 3.6993696865093293, "kl": 1.0460205078125, "learning_rate": 7.61295146252748e-07, "loss": 0.132, "reward": 1.145833358168602, "reward_std": 0.388854943215847, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.645833358168602, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 605.6041793823242, "epoch": 0.41173333333333334, "grad_norm": 5.308243001605844, "kl": 1.2568359375, "learning_rate": 7.583244862906906e-07, "loss": 0.2267, "reward": 1.3255208730697632, "reward_std": 0.4222230240702629, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6588542014360428, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 572.1354370117188, "epoch": 0.41386666666666666, "grad_norm": 6.174139506989273, "kl": 1.595703125, "learning_rate": 7.55342225904771e-07, "loss": 0.1736, "reward": 1.2473958730697632, "reward_std": 0.4287296533584595, "rewards/accuracy_reward": 0.5937500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6536458432674408, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 681.6875152587891, "epoch": 0.416, "grad_norm": 3.555603233744486, "kl": 1.49072265625, "learning_rate": 7.523485311603671e-07, "loss": 0.1133, "reward": 1.1354166865348816, "reward_std": 0.3183029256761074, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000298023224, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 762.3333587646484, "epoch": 0.41813333333333336, "grad_norm": 3.1970989941370918, "kl": 1.09765625, "learning_rate": 7.493435687595724e-07, "loss": 0.1391, "reward": 0.8515625298023224, "reward_std": 0.3355184718966484, "rewards/accuracy_reward": 0.2500000027939677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625298023224, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 667.5625152587891, "epoch": 0.4202666666666667, "grad_norm": 5.220267991345135, "kl": 1.666015625, "learning_rate": 7.463275060319126e-07, "loss": 0.2431, "reward": 1.0338542014360428, "reward_std": 0.4331643208861351, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6380208432674408, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 644.3229370117188, "epoch": 0.4224, "grad_norm": 3.4305060066499995, "kl": 1.2626953125, "learning_rate": 7.43300510925029e-07, "loss": 0.102, "reward": 1.2343750447034836, "reward_std": 0.40559985488653183, "rewards/accuracy_reward": 0.583333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6510416865348816, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 695.9479370117188, "epoch": 0.4245333333333333, "grad_norm": 17.742982311990076, "kl": 2.861328125, "learning_rate": 7.40262751995325e-07, "loss": 0.209, "reward": 1.1484375596046448, "reward_std": 0.4041588194668293, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6276042014360428, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 836.7291717529297, "epoch": 0.4266666666666667, "grad_norm": 6.783101729560672, "kl": 1.001953125, "learning_rate": 7.372143983985823e-07, "loss": 0.059, "reward": 0.8515625149011612, "reward_std": 0.30101747065782547, "rewards/accuracy_reward": 0.2708333348855376, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5807291865348816, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 561.3541946411133, "epoch": 0.4288, "grad_norm": 108.92153903454, "kl": 1.4921875, "learning_rate": 7.341556198805391e-07, "loss": 0.224, "reward": 1.0937500447034836, "reward_std": 0.37287120521068573, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.614583358168602, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 777.1354370117188, "epoch": 0.43093333333333333, "grad_norm": 2.9867411539858932, "kl": 0.8515625, "learning_rate": 7.310865867674396e-07, "loss": 0.1169, "reward": 0.9010417014360428, "reward_std": 0.436382420361042, "rewards/accuracy_reward": 0.2812500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6197916865348816, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 782.5937652587891, "epoch": 0.43306666666666666, "grad_norm": 4.115346747345138, "kl": 0.95947265625, "learning_rate": 7.28007469956549e-07, "loss": 0.1098, "reward": 0.8541667014360428, "reward_std": 0.3877977281808853, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5729166865348816, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 703.0729217529297, "epoch": 0.4352, "grad_norm": 3.6906044672073235, "kl": 1.04296875, "learning_rate": 7.249184409066367e-07, "loss": 0.0836, "reward": 1.0989583730697632, "reward_std": 0.3794466406106949, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.598958358168602, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 662.0312652587891, "epoch": 0.43733333333333335, "grad_norm": 10.29645050459395, "kl": 1.0908203125, "learning_rate": 7.218196716284301e-07, "loss": 0.1656, "reward": 1.0833333730697632, "reward_std": 0.49071626365184784, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5833333432674408, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 612.1041946411133, "epoch": 0.43946666666666667, "grad_norm": 6.21786829985612, "kl": 0.70556640625, "learning_rate": 7.187113346750345e-07, "loss": 0.1538, "reward": 1.1692708730697632, "reward_std": 0.3329445421695709, "rewards/accuracy_reward": 0.5208333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375298023224, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 797.3020935058594, "epoch": 0.4416, "grad_norm": 7.254838875822759, "kl": 1.216796875, "learning_rate": 7.155936031323254e-07, "loss": 0.173, "reward": 0.9635416865348816, "reward_std": 0.47484801709651947, "rewards/accuracy_reward": 0.385416679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5781250149011612, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 700.5937805175781, "epoch": 0.4437333333333333, "grad_norm": 5.539580631333173, "kl": 0.888671875, "learning_rate": 7.124666506093111e-07, "loss": 0.1573, "reward": 1.0078125298023224, "reward_std": 0.3880564123392105, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125149011612, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 710.0729370117188, "epoch": 0.4458666666666667, "grad_norm": 2.6124973150855926, "kl": 1.08203125, "learning_rate": 7.093306512284641e-07, "loss": 0.1311, "reward": 0.9895833730697632, "reward_std": 0.42369063943624496, "rewards/accuracy_reward": 0.3854166828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6041666865348816, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 704.3125152587891, "epoch": 0.448, "grad_norm": 3.0029466909891607, "kl": 1.5205078125, "learning_rate": 7.06185779616026e-07, "loss": 0.1592, "reward": 1.1250000298023224, "reward_std": 0.3417692631483078, "rewards/accuracy_reward": 0.5312500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500298023224, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 737.8333587646484, "epoch": 0.45013333333333333, "grad_norm": 3.5405911740571083, "kl": 1.529296875, "learning_rate": 7.030322108922831e-07, "loss": 0.1676, "reward": 0.9453125298023224, "reward_std": 0.41002119332551956, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6119791716337204, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 655.7291870117188, "epoch": 0.45226666666666665, "grad_norm": 3.3499868360490237, "kl": 1.4091796875, "learning_rate": 6.998701206618152e-07, "loss": 0.2315, "reward": 1.145833358168602, "reward_std": 0.5038532838225365, "rewards/accuracy_reward": 0.5520833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500298023224, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 699.6875152587891, "epoch": 0.4544, "grad_norm": 3.8318683544447714, "kl": 1.29736328125, "learning_rate": 6.966996850037167e-07, "loss": 0.1375, "reward": 1.0937500298023224, "reward_std": 0.4506056234240532, "rewards/accuracy_reward": 0.4479166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6458333432674408, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 597.9687805175781, "epoch": 0.45653333333333335, "grad_norm": 6.442210211914711, "kl": 1.5296630859375, "learning_rate": 6.935210804617932e-07, "loss": 0.1401, "reward": 1.0234375298023224, "reward_std": 0.3827114477753639, "rewards/accuracy_reward": 0.3750000176951289, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375298023224, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 606.8541870117188, "epoch": 0.45866666666666667, "grad_norm": 2.2941469936410193, "kl": 0.8770751953125, "learning_rate": 6.903344840347285e-07, "loss": 0.1111, "reward": 1.1640625447034836, "reward_std": 0.37582574412226677, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6432291865348816, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 727.4479217529297, "epoch": 0.4608, "grad_norm": 4.9236501743742584, "kl": 0.9921875, "learning_rate": 6.871400731662303e-07, "loss": 0.1039, "reward": 1.0494792014360428, "reward_std": 0.40211090445518494, "rewards/accuracy_reward": 0.3958333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.653645858168602, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 742.8750152587891, "epoch": 0.4629333333333333, "grad_norm": 4.066568727046315, "kl": 1.62353515625, "learning_rate": 6.839380257351485e-07, "loss": 0.1486, "reward": 0.9739583730697632, "reward_std": 0.3369832746684551, "rewards/accuracy_reward": 0.36458334140479565, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6093750149011612, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 568.0104370117188, "epoch": 0.4650666666666667, "grad_norm": 2.4094360656304215, "kl": 1.1103515625, "learning_rate": 6.807285200455708e-07, "loss": 0.137, "reward": 1.2864584028720856, "reward_std": 0.46483898907899857, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822917014360428, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 673.2291717529297, "epoch": 0.4672, "grad_norm": 2.369097215742238, "kl": 0.833984375, "learning_rate": 6.775117348168934e-07, "loss": 0.1527, "reward": 1.0156250447034836, "reward_std": 0.30307888612151146, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5989583432674408, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 641.7708587646484, "epoch": 0.4693333333333333, "grad_norm": 2.9340505919038904, "kl": 0.74609375, "learning_rate": 6.742878491738691e-07, "loss": 0.091, "reward": 1.260416716337204, "reward_std": 0.36950888112187386, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7395833432674408, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 620.0625305175781, "epoch": 0.47146666666666665, "grad_norm": 2.6901909422106742, "kl": 1.22802734375, "learning_rate": 6.710570426366329e-07, "loss": 0.0777, "reward": 1.1744792014360428, "reward_std": 0.4119907468557358, "rewards/accuracy_reward": 0.5208333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.653645858168602, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 647.4375, "epoch": 0.4736, "grad_norm": 3.075286484057231, "kl": 1.095703125, "learning_rate": 6.67819495110706e-07, "loss": 0.205, "reward": 1.2552083432674408, "reward_std": 0.3984260931611061, "rewards/accuracy_reward": 0.5833333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750298023224, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 750.0000152587891, "epoch": 0.47573333333333334, "grad_norm": 1.9280597712715668, "kl": 1.0419921875, "learning_rate": 6.645753868769772e-07, "loss": 0.1367, "reward": 1.0416666865348816, "reward_std": 0.4005061313509941, "rewards/accuracy_reward": 0.3854166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500149011612, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 666.4583587646484, "epoch": 0.47786666666666666, "grad_norm": 1.981514239559327, "kl": 1.0439453125, "learning_rate": 6.613248985816649e-07, "loss": 0.151, "reward": 1.1770833730697632, "reward_std": 0.4760870635509491, "rewards/accuracy_reward": 0.5208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500298023224, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 719.6458587646484, "epoch": 0.48, "grad_norm": 3.0499611732687777, "kl": 1.9921875, "learning_rate": 6.580682112262565e-07, "loss": 0.1178, "reward": 1.0390625298023224, "reward_std": 0.3636975698173046, "rewards/accuracy_reward": 0.4270833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6119791716337204, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 657.6354370117188, "epoch": 0.48213333333333336, "grad_norm": 2.0077314384783196, "kl": 0.50634765625, "learning_rate": 6.548055061574312e-07, "loss": 0.0396, "reward": 1.0494792014360428, "reward_std": 0.354195736348629, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6328125149011612, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 584.9166870117188, "epoch": 0.4842666666666667, "grad_norm": 1.6154150634055964, "kl": 0.898193359375, "learning_rate": 6.515369650569602e-07, "loss": 0.0755, "reward": 1.2343750298023224, "reward_std": 0.3991445451974869, "rewards/accuracy_reward": 0.5312500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7031250298023224, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 730.5625152587891, "epoch": 0.4864, "grad_norm": 2.3485701929372054, "kl": 0.856689453125, "learning_rate": 6.482627699315914e-07, "loss": 0.081, "reward": 1.1406250596046448, "reward_std": 0.4424668923020363, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822916865348816, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 659.1979370117188, "epoch": 0.4885333333333333, "grad_norm": 2.8605627209621978, "kl": 0.80029296875, "learning_rate": 6.449831031029133e-07, "loss": 0.1213, "reward": 1.114583358168602, "reward_std": 0.3903278261423111, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.677083358168602, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 626.1042022705078, "epoch": 0.49066666666666664, "grad_norm": 3.0192923488140875, "kl": 1.078125, "learning_rate": 6.416981471972025e-07, "loss": 0.0592, "reward": 1.315104216337204, "reward_std": 0.38509828597307205, "rewards/accuracy_reward": 0.6354166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6796875149011612, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 722.8854217529297, "epoch": 0.4928, "grad_norm": 2.281989684222596, "kl": 0.880859375, "learning_rate": 6.384080851352553e-07, "loss": 0.1056, "reward": 1.0182291865348816, "reward_std": 0.3720996528863907, "rewards/accuracy_reward": 0.385416679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6328125298023224, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 626.4062652587891, "epoch": 0.49493333333333334, "grad_norm": 2.224831725483049, "kl": 1.01953125, "learning_rate": 6.351131001222011e-07, "loss": 0.1128, "reward": 1.1406250447034836, "reward_std": 0.41287852823734283, "rewards/accuracy_reward": 0.4895833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6510417014360428, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 693.6458435058594, "epoch": 0.49706666666666666, "grad_norm": 2.43571312728581, "kl": 0.83447265625, "learning_rate": 6.318133756373009e-07, "loss": 0.0833, "reward": 1.1145833730697632, "reward_std": 0.419599324464798, "rewards/accuracy_reward": 0.4479166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6666667014360428, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 707.8437805175781, "epoch": 0.4992, "grad_norm": 4.855894790549249, "kl": 1.2919921875, "learning_rate": 6.285090954237299e-07, "loss": 0.0917, "reward": 0.9609375298023224, "reward_std": 0.3755484074354172, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375298023224, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 679.3021087646484, "epoch": 0.5013333333333333, "grad_norm": 1.9176504636867993, "kl": 0.8720703125, "learning_rate": 6.252004434783468e-07, "loss": 0.1368, "reward": 1.0755208730697632, "reward_std": 0.38472916185855865, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.669270858168602, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 802.9062805175781, "epoch": 0.5034666666666666, "grad_norm": 6.8553458902973015, "kl": 1.1357421875, "learning_rate": 6.218876040414476e-07, "loss": 0.0643, "reward": 0.8437500447034836, "reward_std": 0.37247762084007263, "rewards/accuracy_reward": 0.23958334140479565, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6041666865348816, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 743.0000152587891, "epoch": 0.5056, "grad_norm": 2.8266738347480693, "kl": 1.078125, "learning_rate": 6.185707615865056e-07, "loss": 0.0694, "reward": 0.9713541865348816, "reward_std": 0.4353698194026947, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.638020858168602, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 689.9166870117188, "epoch": 0.5077333333333334, "grad_norm": 1.349416796393404, "kl": 0.74609375, "learning_rate": 6.152501008099008e-07, "loss": 0.0332, "reward": 1.0625000596046448, "reward_std": 0.40154092013835907, "rewards/accuracy_reward": 0.4062500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6562500149011612, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 767.4062805175781, "epoch": 0.5098666666666667, "grad_norm": 2.3220730666040454, "kl": 0.6181640625, "learning_rate": 6.119258066206333e-07, "loss": 0.0478, "reward": 1.2656250596046448, "reward_std": 0.4823746606707573, "rewards/accuracy_reward": 0.5312500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750149011612, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 686.7604370117188, "epoch": 0.512, "grad_norm": 2.917475156328743, "kl": 0.8125, "learning_rate": 6.085980641300277e-07, "loss": 0.0955, "reward": 1.111979216337204, "reward_std": 0.4742467850446701, "rewards/accuracy_reward": 0.4062500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7057291865348816, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 624.9166870117188, "epoch": 0.5141333333333333, "grad_norm": 3.3550027166193908, "kl": 0.671875, "learning_rate": 6.052670586414254e-07, "loss": 0.0837, "reward": 1.1380208730697632, "reward_std": 0.3583967909216881, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7213541865348816, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 678.4062652587891, "epoch": 0.5162666666666667, "grad_norm": 3.7580893271794977, "kl": 0.65283203125, "learning_rate": 6.01932975639866e-07, "loss": 0.1068, "reward": 1.0442708730697632, "reward_std": 0.3762342110276222, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6692708432674408, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 664.3854370117188, "epoch": 0.5184, "grad_norm": 2.727664247789793, "kl": 0.82568359375, "learning_rate": 5.985960007817583e-07, "loss": 0.1129, "reward": 1.1848958730697632, "reward_std": 0.46748675405979156, "rewards/accuracy_reward": 0.510416679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6744791716337204, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 731.2708740234375, "epoch": 0.5205333333333333, "grad_norm": 5.516653907053587, "kl": 1.6396484375, "learning_rate": 5.952563198845426e-07, "loss": 0.1728, "reward": 0.9843750149011612, "reward_std": 0.40375181287527084, "rewards/accuracy_reward": 0.3645833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6197917014360428, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 660.2083587646484, "epoch": 0.5226666666666666, "grad_norm": 3.4500320027641154, "kl": 1.484375, "learning_rate": 5.91914118916343e-07, "loss": 0.128, "reward": 1.1562500298023224, "reward_std": 0.44681739807128906, "rewards/accuracy_reward": 0.4583333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6979167014360428, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 693.4479370117188, "epoch": 0.5248, "grad_norm": 2.517559682605646, "kl": 0.7060546875, "learning_rate": 5.885695839856129e-07, "loss": 0.0819, "reward": 1.205729216337204, "reward_std": 0.4331817254424095, "rewards/accuracy_reward": 0.520833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.684895858168602, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 611.9166870117188, "epoch": 0.5269333333333334, "grad_norm": 2.287351931421754, "kl": 0.98193359375, "learning_rate": 5.852229013307704e-07, "loss": 0.0271, "reward": 1.0859375298023224, "reward_std": 0.32051411271095276, "rewards/accuracy_reward": 0.38541666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7005208432674408, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 641.1041793823242, "epoch": 0.5290666666666667, "grad_norm": 4.017373603712157, "kl": 1.3046875, "learning_rate": 5.818742573098282e-07, "loss": 0.0928, "reward": 1.1484375447034836, "reward_std": 0.380577452480793, "rewards/accuracy_reward": 0.4479166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.700520858168602, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 703.0104522705078, "epoch": 0.5312, "grad_norm": 8.899673545122152, "kl": 1.818359375, "learning_rate": 5.785238383900171e-07, "loss": 0.1057, "reward": 0.9609375298023224, "reward_std": 0.3786415830254555, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6276041716337204, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 668.1562805175781, "epoch": 0.5333333333333333, "grad_norm": 3.776921543714447, "kl": 1.2470703125, "learning_rate": 5.751718311374019e-07, "loss": 0.1572, "reward": 1.0911458432674408, "reward_std": 0.4785156399011612, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125149011612, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 692.8854370117188, "epoch": 0.5354666666666666, "grad_norm": 5.561615692358444, "kl": 0.7373046875, "learning_rate": 5.718184222064923e-07, "loss": 0.0939, "reward": 1.1145833730697632, "reward_std": 0.3925316818058491, "rewards/accuracy_reward": 0.3541666753590107, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7604166865348816, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 726.2916717529297, "epoch": 0.5376, "grad_norm": 2.396459741540722, "kl": 1.03515625, "learning_rate": 5.684637983298504e-07, "loss": 0.0482, "reward": 1.1484375149011612, "reward_std": 0.47722647339105606, "rewards/accuracy_reward": 0.4375000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375149011612, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 662.1146011352539, "epoch": 0.5397333333333333, "grad_norm": 1.3794330641432073, "kl": 0.79931640625, "learning_rate": 5.65108146307691e-07, "loss": 0.1053, "reward": 1.3203125298023224, "reward_std": 0.37360265105962753, "rewards/accuracy_reward": 0.5312500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7890625149011612, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 665.2812652587891, "epoch": 0.5418666666666667, "grad_norm": 4.3373374964971285, "kl": 0.6513671875, "learning_rate": 5.617516529974812e-07, "loss": 0.0475, "reward": 1.1640625298023224, "reward_std": 0.42270269989967346, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716145858168602, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 659.1562805175781, "epoch": 0.544, "grad_norm": 2.857500389379376, "kl": 0.66552734375, "learning_rate": 5.583945053035345e-07, "loss": 0.0583, "reward": 1.260416716337204, "reward_std": 0.36464181914925575, "rewards/accuracy_reward": 0.5104166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7500000298023224, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 623.3854522705078, "epoch": 0.5461333333333334, "grad_norm": 2.195937822750307, "kl": 0.9833984375, "learning_rate": 5.550368901666031e-07, "loss": 0.0581, "reward": 1.2864583730697632, "reward_std": 0.5202609151601791, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7447916865348816, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 758.0833435058594, "epoch": 0.5482666666666667, "grad_norm": 3.654130355196583, "kl": 0.900634765625, "learning_rate": 5.516789945534687e-07, "loss": 0.0686, "reward": 1.0442708730697632, "reward_std": 0.3605649992823601, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375149011612, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 673.7187576293945, "epoch": 0.5504, "grad_norm": 3.154289332558006, "kl": 0.73095703125, "learning_rate": 5.483210054465313e-07, "loss": 0.0627, "reward": 1.166666716337204, "reward_std": 0.41931793093681335, "rewards/accuracy_reward": 0.38541666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7812500149011612, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 653.3125152587891, "epoch": 0.5525333333333333, "grad_norm": 5.759985243075577, "kl": 2.16796875, "learning_rate": 5.44963109833397e-07, "loss": 0.1535, "reward": 1.1901042014360428, "reward_std": 0.42829064279794693, "rewards/accuracy_reward": 0.4479166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875149011612, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 740.5312652587891, "epoch": 0.5546666666666666, "grad_norm": 1.9810573316642937, "kl": 1.783203125, "learning_rate": 5.416054946964657e-07, "loss": 0.105, "reward": 1.1250000298023224, "reward_std": 0.4422023892402649, "rewards/accuracy_reward": 0.4062500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500149011612, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 731.4687652587891, "epoch": 0.5568, "grad_norm": 1.7507988717457068, "kl": 0.9384765625, "learning_rate": 5.382483470025188e-07, "loss": 0.0749, "reward": 1.0312500447034836, "reward_std": 0.3682085946202278, "rewards/accuracy_reward": 0.28125000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7500000298023224, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 667.3437805175781, "epoch": 0.5589333333333333, "grad_norm": 1.4363727179496517, "kl": 0.61376953125, "learning_rate": 5.34891853692309e-07, "loss": 0.1173, "reward": 1.3385416865348816, "reward_std": 0.32814982905983925, "rewards/accuracy_reward": 0.5729166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7656250298023224, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 688.8958587646484, "epoch": 0.5610666666666667, "grad_norm": 2.577013694965925, "kl": 1.4384765625, "learning_rate": 5.315362016701495e-07, "loss": 0.0793, "reward": 1.2578125596046448, "reward_std": 0.47568681836128235, "rewards/accuracy_reward": 0.5312500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625149011612, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 574.2291717529297, "epoch": 0.5632, "grad_norm": 2.308481592871747, "kl": 0.88916015625, "learning_rate": 5.281815777935076e-07, "loss": -0.0186, "reward": 1.4843750596046448, "reward_std": 0.42332855612039566, "rewards/accuracy_reward": 0.6979167014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.786458358168602, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 738.8229370117188, "epoch": 0.5653333333333334, "grad_norm": 3.6223448115920025, "kl": 1.568115234375, "learning_rate": 5.248281688625984e-07, "loss": 0.1697, "reward": 1.075520858168602, "reward_std": 0.3689369484782219, "rewards/accuracy_reward": 0.3437500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.731770858168602, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 725.0104370117188, "epoch": 0.5674666666666667, "grad_norm": 2.8406366622019723, "kl": 0.662109375, "learning_rate": 5.21476161609983e-07, "loss": 0.0583, "reward": 1.2812500596046448, "reward_std": 0.3292672336101532, "rewards/accuracy_reward": 0.5104166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.770833358168602, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 604.2500152587891, "epoch": 0.5696, "grad_norm": 3.8506726775482414, "kl": 1.0810546875, "learning_rate": 5.181257426901719e-07, "loss": 0.0963, "reward": 1.385416716337204, "reward_std": 0.48288238793611526, "rewards/accuracy_reward": 0.6354166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7500000149011612, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 622.3541870117188, "epoch": 0.5717333333333333, "grad_norm": 5.330990026360423, "kl": 1.537109375, "learning_rate": 5.147770986692298e-07, "loss": 0.1471, "reward": 1.1484375149011612, "reward_std": 0.3771478980779648, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.731770858168602, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 614.8333435058594, "epoch": 0.5738666666666666, "grad_norm": 5.401106666341018, "kl": 1.873046875, "learning_rate": 5.114304160143872e-07, "loss": 0.0376, "reward": 1.1484375149011612, "reward_std": 0.40641965717077255, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875298023224, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 681.125, "epoch": 0.576, "grad_norm": 2.132821179572739, "kl": 1.201171875, "learning_rate": 5.080858810836569e-07, "loss": 0.1105, "reward": 1.1458333730697632, "reward_std": 0.35508203506469727, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708333358168602, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 790.9062805175781, "epoch": 0.5781333333333334, "grad_norm": 7.649321265767852, "kl": 2.1328125, "learning_rate": 5.047436801154574e-07, "loss": 0.0877, "reward": 0.8098958432674408, "reward_std": 0.3416658490896225, "rewards/accuracy_reward": 0.1770833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6328125149011612, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 744.7916870117188, "epoch": 0.5802666666666667, "grad_norm": 85.28665013162654, "kl": 5.845703125, "learning_rate": 5.014039992182416e-07, "loss": 0.3261, "reward": 1.221354216337204, "reward_std": 0.4842746704816818, "rewards/accuracy_reward": 0.4895833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.731770858168602, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 732.8020935058594, "epoch": 0.5824, "grad_norm": 3.0599918049429764, "kl": 1.1376953125, "learning_rate": 4.98067024360134e-07, "loss": 0.0921, "reward": 1.1354167014360428, "reward_std": 0.4286448433995247, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6979166865348816, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 774.8541870117188, "epoch": 0.5845333333333333, "grad_norm": 3.424931636155576, "kl": 1.0224609375, "learning_rate": 4.947329413585745e-07, "loss": 0.0897, "reward": 1.1328125298023224, "reward_std": 0.4221769720315933, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125149011612, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 776.0833587646484, "epoch": 0.5866666666666667, "grad_norm": 2.225091632923016, "kl": 1.328125, "learning_rate": 4.914019358699724e-07, "loss": 0.0384, "reward": 1.098958358168602, "reward_std": 0.3936329632997513, "rewards/accuracy_reward": 0.3437500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7552083432674408, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 789.9687805175781, "epoch": 0.5888, "grad_norm": 2.1457006633806324, "kl": 1.3408203125, "learning_rate": 4.880741933793668e-07, "loss": 0.1124, "reward": 0.8880208730697632, "reward_std": 0.44243620336055756, "rewards/accuracy_reward": 0.2604166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6276042014360428, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 611.0104370117188, "epoch": 0.5909333333333333, "grad_norm": 1.9977027582605016, "kl": 0.7919921875, "learning_rate": 4.847498991900991e-07, "loss": 0.145, "reward": 1.2786458730697632, "reward_std": 0.5421346426010132, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7369791716337204, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 649.6354522705078, "epoch": 0.5930666666666666, "grad_norm": 2.205181301769632, "kl": 0.869140625, "learning_rate": 4.814292384134943e-07, "loss": 0.0844, "reward": 1.190104216337204, "reward_std": 0.5330179780721664, "rewards/accuracy_reward": 0.4895833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.700520858168602, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 725.1875152587891, "epoch": 0.5952, "grad_norm": 3.808352664658485, "kl": 1.3873291015625, "learning_rate": 4.781123959585526e-07, "loss": 0.0791, "reward": 1.0364583730697632, "reward_std": 0.43167005479335785, "rewards/accuracy_reward": 0.3645833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750149011612, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 746.4479370117188, "epoch": 0.5973333333333334, "grad_norm": 4.134001817100567, "kl": 1.68798828125, "learning_rate": 4.7479955652165315e-07, "loss": 0.1172, "reward": 1.1901041865348816, "reward_std": 0.3821878246963024, "rewards/accuracy_reward": 0.4687500260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7213542014360428, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 674.4375152587891, "epoch": 0.5994666666666667, "grad_norm": 2.24511147535919, "kl": 0.7109375, "learning_rate": 4.714909045762702e-07, "loss": 0.0876, "reward": 1.104166716337204, "reward_std": 0.4187440648674965, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708333358168602, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 691.1562652587891, "epoch": 0.6016, "grad_norm": 2.122169433416534, "kl": 0.940185546875, "learning_rate": 4.681866243626992e-07, "loss": 0.1236, "reward": 1.2083334028720856, "reward_std": 0.46300840377807617, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708333358168602, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 690.8333435058594, "epoch": 0.6037333333333333, "grad_norm": 4.58226981588225, "kl": 1.330078125, "learning_rate": 4.6488689987779893e-07, "loss": 0.0949, "reward": 1.299479216337204, "reward_std": 0.5550074055790901, "rewards/accuracy_reward": 0.541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7578125298023224, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 674.6041870117188, "epoch": 0.6058666666666667, "grad_norm": 2.0045221335656356, "kl": 0.68896484375, "learning_rate": 4.615919148647448e-07, "loss": 0.0382, "reward": 1.2916666865348816, "reward_std": 0.3872656598687172, "rewards/accuracy_reward": 0.5312500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7604166865348816, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 596.6562652587891, "epoch": 0.608, "grad_norm": 2.388321430701261, "kl": 0.82421875, "learning_rate": 4.583018528027975e-07, "loss": 0.0618, "reward": 1.2109375298023224, "reward_std": 0.386493518948555, "rewards/accuracy_reward": 0.4479166939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.763020858168602, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 654.4687805175781, "epoch": 0.6101333333333333, "grad_norm": 1.5558415219485469, "kl": 0.814453125, "learning_rate": 4.550168968970869e-07, "loss": 0.0904, "reward": 1.1250000596046448, "reward_std": 0.35980185866355896, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000149011612, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 700.6354370117188, "epoch": 0.6122666666666666, "grad_norm": 2.291619988144414, "kl": 0.670166015625, "learning_rate": 4.5173723006840856e-07, "loss": 0.0508, "reward": 1.0911458730697632, "reward_std": 0.34144046902656555, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125149011612, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 736.4062805175781, "epoch": 0.6144, "grad_norm": 1.9834242115432024, "kl": 0.6470947265625, "learning_rate": 4.484630349430397e-07, "loss": 0.0779, "reward": 1.1953125298023224, "reward_std": 0.46002406626939774, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7473958730697632, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 789.8021087646484, "epoch": 0.6165333333333334, "grad_norm": 2.3855167211416366, "kl": 0.6494140625, "learning_rate": 4.451944938425689e-07, "loss": 0.0549, "reward": 1.0781250447034836, "reward_std": 0.37145940214395523, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7239583432674408, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 674.1041870117188, "epoch": 0.6186666666666667, "grad_norm": 1.918889621315985, "kl": 0.78369140625, "learning_rate": 4.419317887737434e-07, "loss": 0.0668, "reward": 1.3151041865348816, "reward_std": 0.45681414008140564, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7734375149011612, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 625.1562805175781, "epoch": 0.6208, "grad_norm": 4.355501770676797, "kl": 0.7783203125, "learning_rate": 4.386751014183351e-07, "loss": 0.1746, "reward": 1.3515625298023224, "reward_std": 0.42713654786348343, "rewards/accuracy_reward": 0.5625000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7890625149011612, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 596.4270935058594, "epoch": 0.6229333333333333, "grad_norm": 2.342168905969303, "kl": 1.732421875, "learning_rate": 4.354246131230226e-07, "loss": 0.2066, "reward": 1.2239583730697632, "reward_std": 0.5141923800110817, "rewards/accuracy_reward": 0.4687500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7552083432674408, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 733.4270935058594, "epoch": 0.6250666666666667, "grad_norm": 3.4490712051569914, "kl": 1.337890625, "learning_rate": 4.3218050488929415e-07, "loss": 0.0777, "reward": 1.0494791865348816, "reward_std": 0.36260855570435524, "rewards/accuracy_reward": 0.35416667722165585, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 674.5625152587891, "epoch": 0.6272, "grad_norm": 8.813117801633163, "kl": 2.291015625, "learning_rate": 4.289429573633672e-07, "loss": 0.1296, "reward": 1.1614583730697632, "reward_std": 0.48469968885183334, "rewards/accuracy_reward": 0.4270833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750149011612, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 765.7812805175781, "epoch": 0.6293333333333333, "grad_norm": 2.829484028197823, "kl": 1.390625, "learning_rate": 4.257121508261311e-07, "loss": 0.0696, "reward": 0.911458358168602, "reward_std": 0.34382112324237823, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822917014360428, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 688.0000152587891, "epoch": 0.6314666666666666, "grad_norm": 3.8216063069299366, "kl": 1.1865234375, "learning_rate": 4.2248826518310663e-07, "loss": 0.0771, "reward": 1.1796875447034836, "reward_std": 0.5371479094028473, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.731770858168602, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 676.5937652587891, "epoch": 0.6336, "grad_norm": 7.06830505996, "kl": 2.5546875, "learning_rate": 4.1927147995442925e-07, "loss": 0.1405, "reward": 1.221354216337204, "reward_std": 0.4441913291811943, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7213541865348816, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 648.8020935058594, "epoch": 0.6357333333333334, "grad_norm": 3.04039676189216, "kl": 1.435546875, "learning_rate": 4.160619742648517e-07, "loss": 0.137, "reward": 1.2421875596046448, "reward_std": 0.4874294400215149, "rewards/accuracy_reward": 0.4687500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7734375149011612, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 711.3541793823242, "epoch": 0.6378666666666667, "grad_norm": 2.8444606747983294, "kl": 1.380859375, "learning_rate": 4.128599268337699e-07, "loss": 0.1405, "reward": 1.1197917014360428, "reward_std": 0.5157921761274338, "rewards/accuracy_reward": 0.4270833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6927083432674408, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 739.6354370117188, "epoch": 0.64, "grad_norm": 7.001915278918098, "kl": 1.484375, "learning_rate": 4.096655159652717e-07, "loss": 0.1311, "reward": 1.1901041865348816, "reward_std": 0.4807375743985176, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7317708432674408, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 676.8333435058594, "epoch": 0.6421333333333333, "grad_norm": 2.1230958459406404, "kl": 1.44921875, "learning_rate": 4.0647891953820677e-07, "loss": 0.0895, "reward": 1.1640625149011612, "reward_std": 0.46265844255685806, "rewards/accuracy_reward": 0.4270833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7369791865348816, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 647.0104370117188, "epoch": 0.6442666666666667, "grad_norm": 5.592858764517026, "kl": 1.2412109375, "learning_rate": 4.0330031499628327e-07, "loss": 0.0744, "reward": 1.1666667014360428, "reward_std": 0.4661066606640816, "rewards/accuracy_reward": 0.4479166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500149011612, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 615.4583435058594, "epoch": 0.6464, "grad_norm": 2.9391531442419336, "kl": 1.37890625, "learning_rate": 4.00129879338185e-07, "loss": 0.051, "reward": 1.3697917461395264, "reward_std": 0.40326759219169617, "rewards/accuracy_reward": 0.5729166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7968750298023224, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 675.8333435058594, "epoch": 0.6485333333333333, "grad_norm": 7.164162181480773, "kl": 2.1025390625, "learning_rate": 3.969677891077169e-07, "loss": 0.1793, "reward": 1.0364583730697632, "reward_std": 0.436903640627861, "rewards/accuracy_reward": 0.3437500111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.692708358168602, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 654.8021087646484, "epoch": 0.6506666666666666, "grad_norm": 3.704485718269989, "kl": 1.09619140625, "learning_rate": 3.938142203839739e-07, "loss": 0.0982, "reward": 1.229166716337204, "reward_std": 0.47030629962682724, "rewards/accuracy_reward": 0.5312500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6979166865348816, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 734.5208587646484, "epoch": 0.6528, "grad_norm": 3.8971154349180575, "kl": 1.0986328125, "learning_rate": 3.906693487715358e-07, "loss": 0.1313, "reward": 1.138020858168602, "reward_std": 0.47273699939250946, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.731770858168602, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 671.8229370117188, "epoch": 0.6549333333333334, "grad_norm": 4.1318161142771395, "kl": 1.1552734375, "learning_rate": 3.875333493906889e-07, "loss": 0.0962, "reward": 1.1223958730697632, "reward_std": 0.5251179039478302, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7057291865348816, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 635.8854370117188, "epoch": 0.6570666666666667, "grad_norm": 2.0719389846315903, "kl": 1.109375, "learning_rate": 3.844063968676747e-07, "loss": 0.1291, "reward": 1.1276042014360428, "reward_std": 0.47504569590091705, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7317708432674408, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 567.5521011352539, "epoch": 0.6592, "grad_norm": 7.764838278480252, "kl": 0.81787109375, "learning_rate": 3.8128866532496575e-07, "loss": 0.0984, "reward": 1.3619791865348816, "reward_std": 0.4738384932279587, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7786458432674408, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 686.8854370117188, "epoch": 0.6613333333333333, "grad_norm": 6.011262528901589, "kl": 0.507568359375, "learning_rate": 3.7818032837157e-07, "loss": 0.0566, "reward": 1.3359375298023224, "reward_std": 0.4312174804508686, "rewards/accuracy_reward": 0.5625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7734375149011612, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 779.8229217529297, "epoch": 0.6634666666666666, "grad_norm": 1.9236708416924095, "kl": 1.16015625, "learning_rate": 3.7508155909336324e-07, "loss": 0.0663, "reward": 0.9791667014360428, "reward_std": 0.4290865398943424, "rewards/accuracy_reward": 0.3020833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.677083358168602, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 691.8541870117188, "epoch": 0.6656, "grad_norm": 4.249549102956239, "kl": 1.0087890625, "learning_rate": 3.719925300434511e-07, "loss": 0.0839, "reward": 1.213541716337204, "reward_std": 0.4954788386821747, "rewards/accuracy_reward": 0.4791666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750298023224, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 733.5833587646484, "epoch": 0.6677333333333333, "grad_norm": 6.248005893026083, "kl": 1.12109375, "learning_rate": 3.6891341323256044e-07, "loss": 0.1242, "reward": 1.1536458730697632, "reward_std": 0.44258614629507065, "rewards/accuracy_reward": 0.4166666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7369791865348816, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 641.7500305175781, "epoch": 0.6698666666666667, "grad_norm": 3.0058889714754575, "kl": 1.474609375, "learning_rate": 3.6584438011946093e-07, "loss": 0.0957, "reward": 1.1041667014360428, "reward_std": 0.5252336710691452, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000298023224, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 695.4791870117188, "epoch": 0.672, "grad_norm": 1.4454918734522983, "kl": 1.283203125, "learning_rate": 3.627856016014177e-07, "loss": 0.044, "reward": 1.0625000298023224, "reward_std": 0.4147951230406761, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708333358168602, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 702.3125152587891, "epoch": 0.6741333333333334, "grad_norm": 3.5017500820603495, "kl": 1.1630859375, "learning_rate": 3.5973724800467487e-07, "loss": 0.0894, "reward": 0.997395858168602, "reward_std": 0.5040253773331642, "rewards/accuracy_reward": 0.2708333460614085, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625149011612, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 795.8437805175781, "epoch": 0.6762666666666667, "grad_norm": 2.726416422984257, "kl": 1.3046875, "learning_rate": 3.5669948907497106e-07, "loss": 0.0359, "reward": 0.9765625149011612, "reward_std": 0.5282137468457222, "rewards/accuracy_reward": 0.322916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.653645858168602, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 811.9062652587891, "epoch": 0.6784, "grad_norm": 2.1169915873642635, "kl": 1.3486328125, "learning_rate": 3.536724939680873e-07, "loss": 0.0274, "reward": 0.966145858168602, "reward_std": 0.43981292843818665, "rewards/accuracy_reward": 0.2604166707023978, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7057291716337204, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 729.2812652587891, "epoch": 0.6805333333333333, "grad_norm": 4.476093612695441, "kl": 1.8857421875, "learning_rate": 3.506564312404274e-07, "loss": 0.0958, "reward": 0.9791666865348816, "reward_std": 0.4137794151902199, "rewards/accuracy_reward": 0.2916666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000149011612, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 694.7812652587891, "epoch": 0.6826666666666666, "grad_norm": 3.8611937456604775, "kl": 1.583984375, "learning_rate": 3.476514688396326e-07, "loss": 0.0061, "reward": 1.151041716337204, "reward_std": 0.4715312048792839, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.692708358168602, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 641.6354370117188, "epoch": 0.6848, "grad_norm": 2.2123043385070162, "kl": 1.3046875, "learning_rate": 3.446577740952291e-07, "loss": 0.0959, "reward": 1.114583358168602, "reward_std": 0.5834785029292107, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708333358168602, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 748.6771087646484, "epoch": 0.6869333333333333, "grad_norm": 3.373430717613711, "kl": 1.19140625, "learning_rate": 3.416755137093095e-07, "loss": 0.0787, "reward": 1.0859375298023224, "reward_std": 0.4723443537950516, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6692708432674408, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 737.6146087646484, "epoch": 0.6890666666666667, "grad_norm": 3.3096316727110078, "kl": 1.2021484375, "learning_rate": 3.387048537472521e-07, "loss": 0.0327, "reward": 1.0078125298023224, "reward_std": 0.3858821913599968, "rewards/accuracy_reward": 0.3333333386108279, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6744792014360428, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 694.6771087646484, "epoch": 0.6912, "grad_norm": 1.7472386206877715, "kl": 1.1787109375, "learning_rate": 3.3574595962847227e-07, "loss": 0.0701, "reward": 1.1171875596046448, "reward_std": 0.4745071604847908, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875298023224, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 730.9583435058594, "epoch": 0.6933333333333334, "grad_norm": 2.2603113757427393, "kl": 1.01025390625, "learning_rate": 3.327989961172112e-07, "loss": 0.033, "reward": 1.213541716337204, "reward_std": 0.4763711243867874, "rewards/accuracy_reward": 0.4895833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7239583432674408, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 714.6562652587891, "epoch": 0.6954666666666667, "grad_norm": 5.159537210095775, "kl": 2.650390625, "learning_rate": 3.2986412731336175e-07, "loss": 0.1354, "reward": 0.9687500298023224, "reward_std": 0.4482138305902481, "rewards/accuracy_reward": 0.322916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.645833358168602, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 655.8125152587891, "epoch": 0.6976, "grad_norm": 2.0645410683703584, "kl": 1.736328125, "learning_rate": 3.2694151664332966e-07, "loss": 0.001, "reward": 1.1093750447034836, "reward_std": 0.5294991061091423, "rewards/accuracy_reward": 0.4270833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822916865348816, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 707.5312805175781, "epoch": 0.6997333333333333, "grad_norm": 2.172931619200017, "kl": 1.5546875, "learning_rate": 3.240313268509345e-07, "loss": 0.1471, "reward": 1.0208333730697632, "reward_std": 0.477165549993515, "rewards/accuracy_reward": 0.3437500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6770833432674408, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 629.5833587646484, "epoch": 0.7018666666666666, "grad_norm": 2.0240169487056354, "kl": 1.0498046875, "learning_rate": 3.211337199883467e-07, "loss": 0.0698, "reward": 1.4375, "reward_std": 0.5091161876916885, "rewards/accuracy_reward": 0.6458333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7916666865348816, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 730.8125152587891, "epoch": 0.704, "grad_norm": 2.315013907534432, "kl": 1.71875, "learning_rate": 3.182488574070632e-07, "loss": 0.0869, "reward": 0.9453125298023224, "reward_std": 0.5973611772060394, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6119792014360428, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 676.1146087646484, "epoch": 0.7061333333333333, "grad_norm": 2.457999078731587, "kl": 1.6494140625, "learning_rate": 3.153768997489239e-07, "loss": 0.0853, "reward": 1.1015625149011612, "reward_std": 0.48719264566898346, "rewards/accuracy_reward": 0.3958333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7057291865348816, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 736.5312805175781, "epoch": 0.7082666666666667, "grad_norm": 2.6010104787226003, "kl": 1.080078125, "learning_rate": 3.1251800693716547e-07, "loss": 0.1232, "reward": 1.0963542014360428, "reward_std": 0.5459525063633919, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.700520858168602, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 669.5416717529297, "epoch": 0.7104, "grad_norm": 1.8362139405765112, "kl": 1.056640625, "learning_rate": 3.0967233816751655e-07, "loss": -0.0228, "reward": 1.3125000298023224, "reward_std": 0.55170738697052, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7708333432674408, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 693.0833587646484, "epoch": 0.7125333333333334, "grad_norm": 2.5638786630495436, "kl": 1.7734375, "learning_rate": 3.0684005189933314e-07, "loss": 0.0765, "reward": 1.1458333730697632, "reward_std": 0.4304089844226837, "rewards/accuracy_reward": 0.4062500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.739583358168602, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 754.6458435058594, "epoch": 0.7146666666666667, "grad_norm": 1.8865381891279744, "kl": 0.892578125, "learning_rate": 3.0402130584677456e-07, "loss": 0.1089, "reward": 0.997395858168602, "reward_std": 0.4055178463459015, "rewards/accuracy_reward": 0.3020833469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125149011612, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 771.9687652587891, "epoch": 0.7168, "grad_norm": 1.1455584808324157, "kl": 1.21875, "learning_rate": 3.012162569700208e-07, "loss": 0.0055, "reward": 1.0234375596046448, "reward_std": 0.4199650138616562, "rewards/accuracy_reward": 0.354166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.669270858168602, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 712.8333435058594, "epoch": 0.7189333333333333, "grad_norm": 2.342748172421835, "kl": 1.0478515625, "learning_rate": 2.984250614665339e-07, "loss": 0.0331, "reward": 1.0572917014360428, "reward_std": 0.5415750741958618, "rewards/accuracy_reward": 0.3750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822916716337204, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 795.4062652587891, "epoch": 0.7210666666666666, "grad_norm": 1.500532269463762, "kl": 1.345703125, "learning_rate": 2.9564787476235823e-07, "loss": 0.0801, "reward": 0.9765625447034836, "reward_std": 0.41543692350387573, "rewards/accuracy_reward": 0.2916666744276881, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6848958432674408, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 786.4167022705078, "epoch": 0.7232, "grad_norm": 3.819852222206904, "kl": 0.802734375, "learning_rate": 2.9288485150346726e-07, "loss": 0.0624, "reward": 1.013020858168602, "reward_std": 0.45187023282051086, "rewards/accuracy_reward": 0.3020833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375298023224, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 792.3125152587891, "epoch": 0.7253333333333334, "grad_norm": 2.0201043749021426, "kl": 1.5703125, "learning_rate": 2.901361455471508e-07, "loss": 0.0987, "reward": 0.963541716337204, "reward_std": 0.44301638007164, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822916865348816, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 734.0104217529297, "epoch": 0.7274666666666667, "grad_norm": 1.3457584943918326, "kl": 0.85546875, "learning_rate": 2.87401909953449e-07, "loss": 0.0389, "reward": 1.0807292014360428, "reward_std": 0.41036995500326157, "rewards/accuracy_reward": 0.3333333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7473958432674408, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 659.6041870117188, "epoch": 0.7296, "grad_norm": 2.879175690681913, "kl": 1.6728515625, "learning_rate": 2.8468229697662803e-07, "loss": 0.1076, "reward": 1.2031250596046448, "reward_std": 0.39927640557289124, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.786458358168602, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 753.3958435058594, "epoch": 0.7317333333333333, "grad_norm": 2.327499375188182, "kl": 0.76953125, "learning_rate": 2.819774580567027e-07, "loss": 0.1083, "reward": 1.0989583730697632, "reward_std": 0.41911373659968376, "rewards/accuracy_reward": 0.3645833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750298023224, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 770.8854370117188, "epoch": 0.7338666666666667, "grad_norm": 1.9515342429633127, "kl": 1.1884765625, "learning_rate": 2.792875438110033e-07, "loss": 0.0376, "reward": 1.0, "reward_std": 0.4482066258788109, "rewards/accuracy_reward": 0.3020833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6979166865348816, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 745.03125, "epoch": 0.736, "grad_norm": 2.778715625313689, "kl": 1.2744140625, "learning_rate": 2.766127040257884e-07, "loss": 0.0789, "reward": 1.153645858168602, "reward_std": 0.43238315731287, "rewards/accuracy_reward": 0.385416679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7682292014360428, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 626.9166793823242, "epoch": 0.7381333333333333, "grad_norm": 2.061952546668428, "kl": 0.95703125, "learning_rate": 2.739530876479048e-07, "loss": 0.0938, "reward": 1.3880208730697632, "reward_std": 0.42479727417230606, "rewards/accuracy_reward": 0.5729166939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8151041865348816, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 629.4166717529297, "epoch": 0.7402666666666666, "grad_norm": 2.183821055027836, "kl": 0.6650390625, "learning_rate": 2.7130884277649214e-07, "loss": 0.0857, "reward": 1.3437500447034836, "reward_std": 0.4782296419143677, "rewards/accuracy_reward": 0.5729167014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7708333432674408, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 736.6354370117188, "epoch": 0.7424, "grad_norm": 1.9371419950412827, "kl": 1.375, "learning_rate": 2.686801166547377e-07, "loss": 0.1098, "reward": 1.0026041865348816, "reward_std": 0.49701040238142014, "rewards/accuracy_reward": 0.3125000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6901041865348816, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 762.0625152587891, "epoch": 0.7445333333333334, "grad_norm": 1.7558253394051107, "kl": 1.203125, "learning_rate": 2.6606705566167674e-07, "loss": 0.0869, "reward": 1.143229216337204, "reward_std": 0.46467429399490356, "rewards/accuracy_reward": 0.4166666818782687, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625149011612, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 753.2812652587891, "epoch": 0.7466666666666667, "grad_norm": 1.9709169916263403, "kl": 1.4423828125, "learning_rate": 2.6346980530404004e-07, "loss": 0.122, "reward": 1.0130208730697632, "reward_std": 0.4813590347766876, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7005208432674408, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 723.7187652587891, "epoch": 0.7488, "grad_norm": 4.56297873066424, "kl": 2.19921875, "learning_rate": 2.6088851020815384e-07, "loss": 0.1464, "reward": 1.1302083432674408, "reward_std": 0.47396688908338547, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.723958358168602, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 747.7812652587891, "epoch": 0.7509333333333333, "grad_norm": 1.9631494506608835, "kl": 0.8564453125, "learning_rate": 2.5832331411188474e-07, "loss": 0.1031, "reward": 1.330729216337204, "reward_std": 0.5036925226449966, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7890625, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 765.1771087646484, "epoch": 0.7530666666666667, "grad_norm": 2.3865628511024988, "kl": 0.70703125, "learning_rate": 2.557743598566361e-07, "loss": 0.0598, "reward": 1.2031250298023224, "reward_std": 0.38255368173122406, "rewards/accuracy_reward": 0.4479166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.755208358168602, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 746.8750152587891, "epoch": 0.7552, "grad_norm": 6.9166625658423655, "kl": 1.998046875, "learning_rate": 2.5324178937939436e-07, "loss": 0.1578, "reward": 1.174479216337204, "reward_std": 0.5697176456451416, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125149011612, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 639.7916870117188, "epoch": 0.7573333333333333, "grad_norm": 2.425626939261767, "kl": 1.46435546875, "learning_rate": 2.507257437048249e-07, "loss": 0.0574, "reward": 1.2734375298023224, "reward_std": 0.3452693969011307, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7942708432674408, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 726.03125, "epoch": 0.7594666666666666, "grad_norm": 3.260344436398864, "kl": 1.20703125, "learning_rate": 2.482263629374197e-07, "loss": 0.0652, "reward": 1.0000000149011612, "reward_std": 0.49797503650188446, "rewards/accuracy_reward": 0.3020833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6979166865348816, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 739.7500152587891, "epoch": 0.7616, "grad_norm": 3.207979066227694, "kl": 1.52734375, "learning_rate": 2.4574378625369526e-07, "loss": 0.1102, "reward": 1.096354216337204, "reward_std": 0.5646106451749802, "rewards/accuracy_reward": 0.3958333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.700520858168602, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 674.0312576293945, "epoch": 0.7637333333333334, "grad_norm": 5.24696288876381, "kl": 1.8115234375, "learning_rate": 2.432781518944425e-07, "loss": 0.1214, "reward": 1.2473958879709244, "reward_std": 0.3957555927336216, "rewards/accuracy_reward": 0.4895833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7578125149011612, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 712.7396087646484, "epoch": 0.7658666666666667, "grad_norm": 2.503201463336373, "kl": 1.0009765625, "learning_rate": 2.408295971570297e-07, "loss": 0.0721, "reward": 1.2005208730697632, "reward_std": 0.5229354798793793, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7630208730697632, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 740.5937805175781, "epoch": 0.768, "grad_norm": 2.7759740395623105, "kl": 2.009765625, "learning_rate": 2.3839825838775598e-07, "loss": 0.0173, "reward": 1.2500000447034836, "reward_std": 0.5089648514986038, "rewards/accuracy_reward": 0.5104166939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.739583358168602, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 739.4583435058594, "epoch": 0.7701333333333333, "grad_norm": 3.2691753356813398, "kl": 1.5390625, "learning_rate": 2.359842709742603e-07, "loss": 0.1274, "reward": 1.0546875447034836, "reward_std": 0.4684048518538475, "rewards/accuracy_reward": 0.36458334140479565, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6901041865348816, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 737.2812652587891, "epoch": 0.7722666666666667, "grad_norm": 3.5068196972207435, "kl": 1.0185546875, "learning_rate": 2.3358776933798163e-07, "loss": 0.1049, "reward": 1.2187500298023224, "reward_std": 0.5481763109564781, "rewards/accuracy_reward": 0.4687500298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7500000149011612, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 750.8229522705078, "epoch": 0.7744, "grad_norm": 3.44354047068292, "kl": 1.3896484375, "learning_rate": 2.3120888692667355e-07, "loss": 0.0889, "reward": 0.9791666865348816, "reward_std": 0.4294763505458832, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.645833358168602, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 793.1770935058594, "epoch": 0.7765333333333333, "grad_norm": 3.8786939873244735, "kl": 1.611328125, "learning_rate": 2.2884775620697396e-07, "loss": 0.0847, "reward": 0.8437500298023224, "reward_std": 0.41653449833393097, "rewards/accuracy_reward": 0.25000000838190317, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5937500074505806, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 657.4583587646484, "epoch": 0.7786666666666666, "grad_norm": 2.7534361801390785, "kl": 1.38671875, "learning_rate": 2.2650450865702873e-07, "loss": 0.0728, "reward": 1.0989583730697632, "reward_std": 0.6507840603590012, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7031250149011612, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 730.0208435058594, "epoch": 0.7808, "grad_norm": 1.7535269783976952, "kl": 0.80224609375, "learning_rate": 2.2417927475916948e-07, "loss": 0.0451, "reward": 1.2161458730697632, "reward_std": 0.3546758443117142, "rewards/accuracy_reward": 0.4583333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7578125298023224, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 774.8125305175781, "epoch": 0.7829333333333334, "grad_norm": 3.3253787412222624, "kl": 0.931640625, "learning_rate": 2.218721839926493e-07, "loss": 0.1163, "reward": 1.0703125447034836, "reward_std": 0.4362204819917679, "rewards/accuracy_reward": 0.3229166753590107, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7473958432674408, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 752.9375152587891, "epoch": 0.7850666666666667, "grad_norm": 1.4617385019781453, "kl": 1.5615234375, "learning_rate": 2.1958336482643119e-07, "loss": 0.08, "reward": 0.9192708805203438, "reward_std": 0.4304827004671097, "rewards/accuracy_reward": 0.27083334419876337, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375149011612, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 679.8958587646484, "epoch": 0.7872, "grad_norm": 2.585021511333178, "kl": 1.0693359375, "learning_rate": 2.173129447120354e-07, "loss": 0.0458, "reward": 1.1744792014360428, "reward_std": 0.34926126152276993, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7578125149011612, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 778.9062805175781, "epoch": 0.7893333333333333, "grad_norm": 1.55450645453269, "kl": 0.763671875, "learning_rate": 2.1506105007644215e-07, "loss": 0.084, "reward": 1.0677083879709244, "reward_std": 0.340598925948143, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750149011612, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 737.7500152587891, "epoch": 0.7914666666666667, "grad_norm": 4.097252915121006, "kl": 0.8349609375, "learning_rate": 2.1282780631505106e-07, "loss": 0.0996, "reward": 1.2265625596046448, "reward_std": 0.4554222673177719, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7682291865348816, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 750.7708435058594, "epoch": 0.7936, "grad_norm": 2.7685818015071133, "kl": 0.6591796875, "learning_rate": 2.106133377846996e-07, "loss": 0.079, "reward": 1.1250000596046448, "reward_std": 0.3624304011464119, "rewards/accuracy_reward": 0.3645833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7604166865348816, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 706.5520935058594, "epoch": 0.7957333333333333, "grad_norm": 1.4575123252714652, "kl": 0.8974609375, "learning_rate": 2.0841776779673712e-07, "loss": 0.1091, "reward": 1.119791716337204, "reward_std": 0.39200131222605705, "rewards/accuracy_reward": 0.39583334885537624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.723958358168602, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 712.9583435058594, "epoch": 0.7978666666666666, "grad_norm": 2.232824880577458, "kl": 1.0927734375, "learning_rate": 2.0624121861015957e-07, "loss": 0.0676, "reward": 1.1640625298023224, "reward_std": 0.38127563893795013, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7473958432674408, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 625.6771087646484, "epoch": 0.8, "grad_norm": 5.941213344681978, "kl": 1.75, "learning_rate": 2.040838114248009e-07, "loss": 0.0789, "reward": 1.3958333730697632, "reward_std": 0.35454390197992325, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7916666865348816, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 737.3020935058594, "epoch": 0.8021333333333334, "grad_norm": 5.357951916612967, "kl": 1.1103515625, "learning_rate": 2.019456663745839e-07, "loss": 0.1002, "reward": 1.190104216337204, "reward_std": 0.45377591252326965, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875149011612, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 647.6146087646484, "epoch": 0.8042666666666667, "grad_norm": 2.4257869176075304, "kl": 1.609375, "learning_rate": 1.9982690252083124e-07, "loss": 0.0367, "reward": 1.2239583730697632, "reward_std": 0.5055340826511383, "rewards/accuracy_reward": 0.5000000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7239583432674408, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 698.1041870117188, "epoch": 0.8064, "grad_norm": 1.2589748647570982, "kl": 0.4638671875, "learning_rate": 1.9772763784563515e-07, "loss": 0.0578, "reward": 1.354166716337204, "reward_std": 0.40980012714862823, "rewards/accuracy_reward": 0.5312500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8229166865348816, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 644.2500305175781, "epoch": 0.8085333333333333, "grad_norm": 1.6568574980602617, "kl": 0.8720703125, "learning_rate": 1.956479892452878e-07, "loss": 0.1696, "reward": 1.2265625149011612, "reward_std": 0.4330429956316948, "rewards/accuracy_reward": 0.4583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7682291865348816, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 812.9062652587891, "epoch": 0.8106666666666666, "grad_norm": 4.6751933457063455, "kl": 1.73291015625, "learning_rate": 1.9358807252377224e-07, "loss": 0.1125, "reward": 0.911458358168602, "reward_std": 0.3435791879892349, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822916865348816, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 705.5729217529297, "epoch": 0.8128, "grad_norm": 2.202804636105119, "kl": 1.5283203125, "learning_rate": 1.915480023863134e-07, "loss": 0.119, "reward": 1.2317708730697632, "reward_std": 0.4287576675415039, "rewards/accuracy_reward": 0.4687500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7630208432674408, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 772.1041870117188, "epoch": 0.8149333333333333, "grad_norm": 1.578475813955147, "kl": 0.9765625, "learning_rate": 1.895278924329914e-07, "loss": 0.1039, "reward": 1.0156250596046448, "reward_std": 0.5112209767103195, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822916865348816, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 663.3021087646484, "epoch": 0.8170666666666667, "grad_norm": 1.7491393019341375, "kl": 0.58837890625, "learning_rate": 1.8752785515241533e-07, "loss": 0.0633, "reward": 1.3177083730697632, "reward_std": 0.4606628455221653, "rewards/accuracy_reward": 0.5416666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7760417014360428, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 719.3541870117188, "epoch": 0.8192, "grad_norm": 2.126480419779782, "kl": 1.0810546875, "learning_rate": 1.8554800191545954e-07, "loss": 0.0575, "reward": 1.1562500298023224, "reward_std": 0.35498932003974915, "rewards/accuracy_reward": 0.4270833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7291666865348816, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 742.5937652587891, "epoch": 0.8213333333333334, "grad_norm": 3.178627789937521, "kl": 1.314453125, "learning_rate": 1.8358844296906213e-07, "loss": 0.0802, "reward": 1.0807291865348816, "reward_std": 0.4337238222360611, "rewards/accuracy_reward": 0.3645833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716145858168602, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 721.6458587646484, "epoch": 0.8234666666666667, "grad_norm": 22.434183591624898, "kl": 1.8828125, "learning_rate": 1.816492874300856e-07, "loss": 0.1242, "reward": 1.166666716337204, "reward_std": 0.5493014454841614, "rewards/accuracy_reward": 0.4375000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7291667014360428, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 684.4687805175781, "epoch": 0.8256, "grad_norm": 2.548088556426522, "kl": 0.99267578125, "learning_rate": 1.7973064327924126e-07, "loss": 0.0546, "reward": 1.1875000298023224, "reward_std": 0.5050683170557022, "rewards/accuracy_reward": 0.4270833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7604166865348816, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 734.2916717529297, "epoch": 0.8277333333333333, "grad_norm": 1.8889203482514176, "kl": 0.66015625, "learning_rate": 1.778326173550761e-07, "loss": 0.0335, "reward": 1.1171875447034836, "reward_std": 0.4616394564509392, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.763020858168602, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 746.7500152587891, "epoch": 0.8298666666666666, "grad_norm": 3.7971071317680223, "kl": 0.96484375, "learning_rate": 1.7595531534802315e-07, "loss": 0.1335, "reward": 1.1328125298023224, "reward_std": 0.34206103533506393, "rewards/accuracy_reward": 0.4062500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625298023224, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 735.3958435058594, "epoch": 0.832, "grad_norm": 1.7191547186183351, "kl": 1.2353515625, "learning_rate": 1.7409884179451712e-07, "loss": 0.0827, "reward": 1.0156250298023224, "reward_std": 0.463408388197422, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7031250149011612, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 835.9479370117188, "epoch": 0.8341333333333333, "grad_norm": 2.4409318975848224, "kl": 1.857421875, "learning_rate": 1.722633000711723e-07, "loss": 0.0711, "reward": 0.9947917014360428, "reward_std": 0.42818325012922287, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.661458358168602, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 719.0208435058594, "epoch": 0.8362666666666667, "grad_norm": 2.650545804796997, "kl": 1.470703125, "learning_rate": 1.7044879238902673e-07, "loss": 0.0996, "reward": 1.1041667014360428, "reward_std": 0.5181029364466667, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000149011612, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 674.6771087646484, "epoch": 0.8384, "grad_norm": 3.0426392653315784, "kl": 1.955078125, "learning_rate": 1.6865541978785082e-07, "loss": 0.1673, "reward": 1.091145858168602, "reward_std": 0.5123106092214584, "rewards/accuracy_reward": 0.4062500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6848958432674408, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 685.5416870117188, "epoch": 0.8405333333333334, "grad_norm": 2.2798995105984106, "kl": 1.1328125, "learning_rate": 1.6688328213052017e-07, "loss": 0.1729, "reward": 1.0963541865348816, "reward_std": 0.4155949652194977, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7213541716337204, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 752.3125305175781, "epoch": 0.8426666666666667, "grad_norm": 2.9684998404236205, "kl": 1.0869140625, "learning_rate": 1.6513247809745584e-07, "loss": 0.1393, "reward": 0.9869792014360428, "reward_std": 0.4611617475748062, "rewards/accuracy_reward": 0.3020833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.684895858168602, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 765.3333587646484, "epoch": 0.8448, "grad_norm": 2.267031288886419, "kl": 0.880859375, "learning_rate": 1.6340310518112837e-07, "loss": 0.0449, "reward": 1.1614583879709244, "reward_std": 0.5570357888936996, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7135416865348816, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 732.6562652587891, "epoch": 0.8469333333333333, "grad_norm": 2.968716228731711, "kl": 1.0556640625, "learning_rate": 1.6169525968062963e-07, "loss": 0.0572, "reward": 1.0911458432674408, "reward_std": 0.5004110783338547, "rewards/accuracy_reward": 0.3750000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7161458432674408, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 641.0208587646484, "epoch": 0.8490666666666666, "grad_norm": 2.3709992838502982, "kl": 0.982421875, "learning_rate": 1.600090366963105e-07, "loss": 0.1097, "reward": 1.3515625298023224, "reward_std": 0.5505589917302132, "rewards/accuracy_reward": 0.5729166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7786458432674408, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 754.0521087646484, "epoch": 0.8512, "grad_norm": 3.3869434945750236, "kl": 1.7177734375, "learning_rate": 1.5834453012448454e-07, "loss": 0.1107, "reward": 1.0156250149011612, "reward_std": 0.4361722990870476, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6822916865348816, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 775.1458435058594, "epoch": 0.8533333333333334, "grad_norm": 1.7552632015249623, "kl": 1.130859375, "learning_rate": 1.5670183265220044e-07, "loss": 0.0558, "reward": 0.9921875447034836, "reward_std": 0.4147378206253052, "rewards/accuracy_reward": 0.2916666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.700520858168602, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 695.0000305175781, "epoch": 0.8554666666666667, "grad_norm": 2.4422984120573465, "kl": 1.2822265625, "learning_rate": 1.5508103575207987e-07, "loss": 0.1304, "reward": 1.1901041865348816, "reward_std": 0.4788772165775299, "rewards/accuracy_reward": 0.4687500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7213541865348816, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 722.1354217529297, "epoch": 0.8576, "grad_norm": 2.3491045798533823, "kl": 0.9541015625, "learning_rate": 1.534822296772245e-07, "loss": 0.0731, "reward": 1.143229216337204, "reward_std": 0.4044281542301178, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7682291865348816, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 718.5104370117188, "epoch": 0.8597333333333333, "grad_norm": 1.4604861227005714, "kl": 0.861328125, "learning_rate": 1.519055034561902e-07, "loss": 0.1305, "reward": 1.1484375298023224, "reward_std": 0.551568478345871, "rewards/accuracy_reward": 0.4270833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7213541716337204, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 809.4271087646484, "epoch": 0.8618666666666667, "grad_norm": 1.9573260113015454, "kl": 1.150390625, "learning_rate": 1.5035094488802919e-07, "loss": 0.0859, "reward": 1.1328125298023224, "reward_std": 0.5649653822183609, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6953125149011612, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 708.5416870117188, "epoch": 0.864, "grad_norm": 1.8307966620112655, "kl": 0.994140625, "learning_rate": 1.488186405374015e-07, "loss": 0.0849, "reward": 1.2083333730697632, "reward_std": 0.5416659787297249, "rewards/accuracy_reward": 0.479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7291666716337204, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 732.9791717529297, "epoch": 0.8661333333333333, "grad_norm": 1.5317692375594139, "kl": 0.8349609375, "learning_rate": 1.4730867572975427e-07, "loss": 0.1122, "reward": 1.1432291865348816, "reward_std": 0.4479290693998337, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.747395858168602, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 673.3021087646484, "epoch": 0.8682666666666666, "grad_norm": 3.057221820656228, "kl": 1.0791015625, "learning_rate": 1.4582113454657056e-07, "loss": 0.0773, "reward": 1.2578125298023224, "reward_std": 0.5970419347286224, "rewards/accuracy_reward": 0.4687500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7890625298023224, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 632.8229370117188, "epoch": 0.8704, "grad_norm": 1.750668342116461, "kl": 1.111328125, "learning_rate": 1.4435609982068764e-07, "loss": 0.119, "reward": 1.3802083432674408, "reward_std": 0.47477005422115326, "rewards/accuracy_reward": 0.5833333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7968750298023224, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 714.9479370117188, "epoch": 0.8725333333333334, "grad_norm": 2.053669308043276, "kl": 1.150390625, "learning_rate": 1.4291365313168391e-07, "loss": 0.0093, "reward": 1.1093750298023224, "reward_std": 0.4146904796361923, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750298023224, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 737.0416717529297, "epoch": 0.8746666666666667, "grad_norm": 2.067203499829489, "kl": 1.23046875, "learning_rate": 1.4149387480133674e-07, "loss": -0.0144, "reward": 1.0260416865348816, "reward_std": 0.46504535526037216, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750149011612, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 782.4479370117188, "epoch": 0.8768, "grad_norm": 2.201137205854952, "kl": 1.279296875, "learning_rate": 1.4009684388914954e-07, "loss": 0.0488, "reward": 1.0208333730697632, "reward_std": 0.4742833971977234, "rewards/accuracy_reward": 0.3333333386108279, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000298023224, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 735.1146087646484, "epoch": 0.8789333333333333, "grad_norm": 2.0850893145919263, "kl": 0.9775390625, "learning_rate": 1.3872263818794915e-07, "loss": 0.054, "reward": 1.252604216337204, "reward_std": 0.47272689640522003, "rewards/accuracy_reward": 0.5000000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7526042014360428, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 737.8958435058594, "epoch": 0.8810666666666667, "grad_norm": 3.903653333883459, "kl": 1.1669921875, "learning_rate": 1.3737133421955477e-07, "loss": 0.1438, "reward": 1.0859375596046448, "reward_std": 0.3797566667199135, "rewards/accuracy_reward": 0.3854166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7005208432674408, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 734.1771087646484, "epoch": 0.8832, "grad_norm": 3.256399521259395, "kl": 1.779296875, "learning_rate": 1.360430072305157e-07, "loss": 0.0931, "reward": 1.1171875298023224, "reward_std": 0.3713390752673149, "rewards/accuracy_reward": 0.4270833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6901042014360428, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 755.0312728881836, "epoch": 0.8853333333333333, "grad_norm": 2.7713487572106423, "kl": 1.193359375, "learning_rate": 1.3473773118792247e-07, "loss": 0.0627, "reward": 1.0729166865348816, "reward_std": 0.4558168202638626, "rewards/accuracy_reward": 0.34375001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7291667014360428, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 714.4479370117188, "epoch": 0.8874666666666666, "grad_norm": 1.5917831263975801, "kl": 1.1884765625, "learning_rate": 1.3345557877528736e-07, "loss": 0.0228, "reward": 1.1536458730697632, "reward_std": 0.4172417223453522, "rewards/accuracy_reward": 0.4270833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625149011612, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 729.0208511352539, "epoch": 0.8896, "grad_norm": 1.8346189483153255, "kl": 0.908203125, "learning_rate": 1.3219662138849704e-07, "loss": 0.0762, "reward": 1.143229216337204, "reward_std": 0.5342837795615196, "rewards/accuracy_reward": 0.4270833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716145858168602, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 705.4271087646484, "epoch": 0.8917333333333334, "grad_norm": 3.0405441362276697, "kl": 0.93359375, "learning_rate": 1.309609291318374e-07, "loss": 0.0913, "reward": 1.2447917461395264, "reward_std": 0.48353345692157745, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7656250149011612, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 624.6771011352539, "epoch": 0.8938666666666667, "grad_norm": 2.4031973960368482, "kl": 0.90771484375, "learning_rate": 1.2974857081408933e-07, "loss": 0.1541, "reward": 1.1718750447034836, "reward_std": 0.44468991458415985, "rewards/accuracy_reward": 0.4166666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7552083432674408, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 644.5000152587891, "epoch": 0.896, "grad_norm": 2.7925764346431157, "kl": 1.61328125, "learning_rate": 1.2855961394469728e-07, "loss": 0.1007, "reward": 1.0364583730697632, "reward_std": 0.49245011806488037, "rewards/accuracy_reward": 0.3333333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7031250298023224, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 708.2604217529297, "epoch": 0.8981333333333333, "grad_norm": 1.9701217013199572, "kl": 0.52099609375, "learning_rate": 1.2739412473001038e-07, "loss": 0.0371, "reward": 1.1796875298023224, "reward_std": 0.30052849650382996, "rewards/accuracy_reward": 0.3645833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8151042014360428, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 694.3020935058594, "epoch": 0.9002666666666667, "grad_norm": 2.082069714596071, "kl": 1.2578125, "learning_rate": 1.262521680695952e-07, "loss": 0.0728, "reward": 1.0182292014360428, "reward_std": 0.3820762485265732, "rewards/accuracy_reward": 0.2812500027939677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7369791865348816, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 768.9166717529297, "epoch": 0.9024, "grad_norm": 3.3745301908072873, "kl": 1.65234375, "learning_rate": 1.251338075526224e-07, "loss": 0.045, "reward": 0.919270858168602, "reward_std": 0.40871209651231766, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375149011612, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 645.3958435058594, "epoch": 0.9045333333333333, "grad_norm": 1.622420953888201, "kl": 0.919677734375, "learning_rate": 1.240391054543255e-07, "loss": 0.0611, "reward": 1.4062500447034836, "reward_std": 0.41048867627978325, "rewards/accuracy_reward": 0.5937500298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8125000298023224, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 628.5729522705078, "epoch": 0.9066666666666666, "grad_norm": 2.6284297956508955, "kl": 1.2958984375, "learning_rate": 1.2296812273253306e-07, "loss": 0.0822, "reward": 1.299479216337204, "reward_std": 0.45082786679267883, "rewards/accuracy_reward": 0.5104166939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7890625149011612, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 704.5833435058594, "epoch": 0.9088, "grad_norm": 1.5533935931581717, "kl": 1.2666015625, "learning_rate": 1.2192091902427471e-07, "loss": 0.0736, "reward": 1.166666716337204, "reward_std": 0.5383878573775291, "rewards/accuracy_reward": 0.447916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500298023224, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 733.4687652587891, "epoch": 0.9109333333333334, "grad_norm": 5.307925548469623, "kl": 1.8876953125, "learning_rate": 1.208975526424596e-07, "loss": 0.1743, "reward": 0.9661458730697632, "reward_std": 0.5588085800409317, "rewards/accuracy_reward": 0.3020833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625149011612, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 695.4167022705078, "epoch": 0.9130666666666667, "grad_norm": 1.9477324310942372, "kl": 1.1796875, "learning_rate": 1.1989808057262999e-07, "loss": 0.1144, "reward": 1.2109375298023224, "reward_std": 0.6319162100553513, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7317708730697632, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 707.7708587646484, "epoch": 0.9152, "grad_norm": 1.630302912855386, "kl": 0.966796875, "learning_rate": 1.1892255846978763e-07, "loss": 0.1098, "reward": 1.2317708730697632, "reward_std": 0.5329065248370171, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7526041716337204, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 641.4687805175781, "epoch": 0.9173333333333333, "grad_norm": 1.503201933100271, "kl": 1.10888671875, "learning_rate": 1.179710406552947e-07, "loss": 0.1212, "reward": 1.2083333879709244, "reward_std": 0.5156615823507309, "rewards/accuracy_reward": 0.4895833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500298023224, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 724.1979217529297, "epoch": 0.9194666666666667, "grad_norm": 3.6563152259064813, "kl": 1.2880859375, "learning_rate": 1.1704358011384915e-07, "loss": 0.0636, "reward": 0.9505208432674408, "reward_std": 0.4120257571339607, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7005208432674408, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 657.8229370117188, "epoch": 0.9216, "grad_norm": 1.9295206416493746, "kl": 1.548828125, "learning_rate": 1.161402284905339e-07, "loss": 0.1538, "reward": 1.1171875596046448, "reward_std": 0.5346331149339676, "rewards/accuracy_reward": 0.4270833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6901041865348816, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 693.7396240234375, "epoch": 0.9237333333333333, "grad_norm": 1.432096399760419, "kl": 1.0078125, "learning_rate": 1.1526103608794149e-07, "loss": 0.1123, "reward": 1.2187500596046448, "reward_std": 0.47654059529304504, "rewards/accuracy_reward": 0.4895833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7291666865348816, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 778.5729370117188, "epoch": 0.9258666666666666, "grad_norm": 2.6509053732035186, "kl": 1.279296875, "learning_rate": 1.1440605186337254e-07, "loss": -0.0008, "reward": 0.8645833730697632, "reward_std": 0.4510863274335861, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6979166865348816, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 713.1250152587891, "epoch": 0.928, "grad_norm": 3.4533275842613125, "kl": 1.57421875, "learning_rate": 1.1357532342611005e-07, "loss": 0.0528, "reward": 1.1354167312383652, "reward_std": 0.490027979016304, "rewards/accuracy_reward": 0.3437500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7916666865348816, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 737.1875152587891, "epoch": 0.9301333333333334, "grad_norm": 2.0824368172370975, "kl": 1.267578125, "learning_rate": 1.1276889703476789e-07, "loss": 0.0309, "reward": 1.1197916716337204, "reward_std": 0.47420261800289154, "rewards/accuracy_reward": 0.385416679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750298023224, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 751.4791717529297, "epoch": 0.9322666666666667, "grad_norm": 3.5797973969109487, "kl": 1.08984375, "learning_rate": 1.1198681759471522e-07, "loss": 0.0705, "reward": 1.0807292014360428, "reward_std": 0.40865882486104965, "rewards/accuracy_reward": 0.3645833460614085, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.716145858168602, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 731.7291870117188, "epoch": 0.9344, "grad_norm": 2.4222477889985123, "kl": 1.0146484375, "learning_rate": 1.1122912865557577e-07, "loss": 0.0581, "reward": 1.1380208730697632, "reward_std": 0.4775843694806099, "rewards/accuracy_reward": 0.3958333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875149011612, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 694.8750152587891, "epoch": 0.9365333333333333, "grad_norm": 2.9595686017050586, "kl": 1.4482421875, "learning_rate": 1.1049587240880295e-07, "loss": 0.0858, "reward": 1.0312500596046448, "reward_std": 0.5613971948623657, "rewards/accuracy_reward": 0.3333333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6979166865348816, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 748.0104522705078, "epoch": 0.9386666666666666, "grad_norm": 3.6804903948236647, "kl": 1.63671875, "learning_rate": 1.0978708968533028e-07, "loss": 0.049, "reward": 1.1744792014360428, "reward_std": 0.541608139872551, "rewards/accuracy_reward": 0.4375000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7369791716337204, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 821.8645935058594, "epoch": 0.9408, "grad_norm": 2.254543727913351, "kl": 1.068359375, "learning_rate": 1.0910281995329798e-07, "loss": 0.0314, "reward": 0.8046875, "reward_std": 0.37333502247929573, "rewards/accuracy_reward": 0.14583333861082792, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6588541865348816, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 673.3646087646484, "epoch": 0.9429333333333333, "grad_norm": 4.607468976961304, "kl": 0.91015625, "learning_rate": 1.0844310131585496e-07, "loss": 0.0748, "reward": 1.1744791865348816, "reward_std": 0.5283230543136597, "rewards/accuracy_reward": 0.4479166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625298023224, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 589.3229370117188, "epoch": 0.9450666666666667, "grad_norm": 1.1400175426656556, "kl": 1.2119140625, "learning_rate": 1.0780797050903712e-07, "loss": 0.0422, "reward": 1.2786459028720856, "reward_std": 0.37607645988464355, "rewards/accuracy_reward": 0.4687500149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8098958432674408, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 728.2812652587891, "epoch": 0.9472, "grad_norm": 1.9170494197283434, "kl": 1.0166015625, "learning_rate": 1.07197462899722e-07, "loss": 0.0662, "reward": 1.0625000298023224, "reward_std": 0.5107837617397308, "rewards/accuracy_reward": 0.34375000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500298023224, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 715.2604217529297, "epoch": 0.9493333333333334, "grad_norm": 2.873741465652177, "kl": 1.048828125, "learning_rate": 1.0661161248365888e-07, "loss": 0.1132, "reward": 1.2265625596046448, "reward_std": 0.6806151270866394, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7265625149011612, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 790.3437652587891, "epoch": 0.9514666666666667, "grad_norm": 3.6992638630578956, "kl": 0.8779296875, "learning_rate": 1.0605045188357633e-07, "loss": 0.0711, "reward": 1.0208333879709244, "reward_std": 0.5120280906558037, "rewards/accuracy_reward": 0.3020833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7187500298023224, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 767.4479370117188, "epoch": 0.9536, "grad_norm": 1.6337261755318853, "kl": 1.05078125, "learning_rate": 1.0551401234736524e-07, "loss": 0.1, "reward": 1.0104167014360428, "reward_std": 0.4127493128180504, "rewards/accuracy_reward": 0.2708333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7395833730697632, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 704.1875152587891, "epoch": 0.9557333333333333, "grad_norm": 16.258107928681625, "kl": 1.107421875, "learning_rate": 1.0500232374633883e-07, "loss": 0.1343, "reward": 1.3098959028720856, "reward_std": 0.5429221987724304, "rewards/accuracy_reward": 0.5312500223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7786458432674408, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 736.5104370117188, "epoch": 0.9578666666666666, "grad_norm": 2.394106906459762, "kl": 1.3466796875, "learning_rate": 1.0451541457356948e-07, "loss": 0.0688, "reward": 1.1380208730697632, "reward_std": 0.5113592520356178, "rewards/accuracy_reward": 0.42708334885537624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7109375149011612, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 710.6666870117188, "epoch": 0.96, "grad_norm": 2.0483608586867628, "kl": 1.013671875, "learning_rate": 1.0405331194230196e-07, "loss": 0.1184, "reward": 1.0963542014360428, "reward_std": 0.42430291324853897, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7213541716337204, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 705.1458435058594, "epoch": 0.9621333333333333, "grad_norm": 1.6428274102372595, "kl": 1.072265625, "learning_rate": 1.036160415844436e-07, "loss": 0.0393, "reward": 1.2578125298023224, "reward_std": 0.5380661189556122, "rewards/accuracy_reward": 0.5104166865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7473958432674408, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 619.3437652587891, "epoch": 0.9642666666666667, "grad_norm": 1.7551241747148723, "kl": 0.943359375, "learning_rate": 1.0320362784913168e-07, "loss": 0.0358, "reward": 1.1901042014360428, "reward_std": 0.5258347168564796, "rewards/accuracy_reward": 0.4375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7526041865348816, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 757.4166870117188, "epoch": 0.9664, "grad_norm": 1.5896793238771025, "kl": 0.7626953125, "learning_rate": 1.0281609370137723e-07, "loss": 0.0252, "reward": 1.1875000298023224, "reward_std": 0.4521985128521919, "rewards/accuracy_reward": 0.4166666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.770833358168602, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 627.6041793823242, "epoch": 0.9685333333333334, "grad_norm": 3.8311761420761203, "kl": 1.84375, "learning_rate": 1.024534607207864e-07, "loss": 0.1581, "reward": 1.0677083879709244, "reward_std": 0.5132449232041836, "rewards/accuracy_reward": 0.3958333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6718750298023224, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 751.3750305175781, "epoch": 0.9706666666666667, "grad_norm": 3.41251835441772, "kl": 0.8310546875, "learning_rate": 1.0211574910035891e-07, "loss": 0.0948, "reward": 1.052083358168602, "reward_std": 0.43458399176597595, "rewards/accuracy_reward": 0.3125000102445483, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.739583358168602, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 727.3854217529297, "epoch": 0.9728, "grad_norm": 3.3487498391688364, "kl": 1.189453125, "learning_rate": 1.0180297764536348e-07, "loss": 0.0262, "reward": 1.0234375298023224, "reward_std": 0.5761856287717819, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6901041865348816, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 772.9896087646484, "epoch": 0.9749333333333333, "grad_norm": 1.4740237937952425, "kl": 1.1005859375, "learning_rate": 1.015151637722906e-07, "loss": 0.091, "reward": 0.9765625596046448, "reward_std": 0.485679030418396, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.684895858168602, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 629.4375152587891, "epoch": 0.9770666666666666, "grad_norm": 2.088255511662504, "kl": 0.8876953125, "learning_rate": 1.0125232350788295e-07, "loss": 0.0873, "reward": 1.3671875596046448, "reward_std": 0.522374838590622, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.763020858168602, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 668.4791870117188, "epoch": 0.9792, "grad_norm": 1.3331269505812111, "kl": 0.7607421875, "learning_rate": 1.0101447148824265e-07, "loss": 0.0169, "reward": 1.3880208730697632, "reward_std": 0.40413716435432434, "rewards/accuracy_reward": 0.572916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8151041865348816, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 758.3125152587891, "epoch": 0.9813333333333333, "grad_norm": 2.0621763744430535, "kl": 0.9912109375, "learning_rate": 1.0080162095801662e-07, "loss": 0.0811, "reward": 1.072916716337204, "reward_std": 0.4392261281609535, "rewards/accuracy_reward": 0.3645833432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.708333358168602, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 676.0312805175781, "epoch": 0.9834666666666667, "grad_norm": 1.8110615199199416, "kl": 1.4462890625, "learning_rate": 1.006137837696587e-07, "loss": 0.0877, "reward": 1.2109375298023224, "reward_std": 0.38446957617998123, "rewards/accuracy_reward": 0.4375000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7734375149011612, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 734.4687652587891, "epoch": 0.9856, "grad_norm": 1.726873645760791, "kl": 0.890625, "learning_rate": 1.0045097038276994e-07, "loss": 0.0598, "reward": 1.0833333730697632, "reward_std": 0.41529713198542595, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7083333432674408, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 723.5104370117188, "epoch": 0.9877333333333334, "grad_norm": 2.8491371575048827, "kl": 1.3125, "learning_rate": 1.0031318986351587e-07, "loss": 0.024, "reward": 1.0078125149011612, "reward_std": 0.46036942303180695, "rewards/accuracy_reward": 0.322916679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6848958507180214, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 635.7083435058594, "epoch": 0.9898666666666667, "grad_norm": 1.6047054865632053, "kl": 1.0810546875, "learning_rate": 1.0020044988412196e-07, "loss": 0.0451, "reward": 1.2734375298023224, "reward_std": 0.6459824442863464, "rewards/accuracy_reward": 0.4895833507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7838541865348816, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 650.1146087646484, "epoch": 0.992, "grad_norm": 4.339234215911267, "kl": 1.330078125, "learning_rate": 1.0011275672244634e-07, "loss": 0.0628, "reward": 1.1770833432674408, "reward_std": 0.4456011652946472, "rewards/accuracy_reward": 0.4166666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7604166716337204, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 693.6875152587891, "epoch": 0.9941333333333333, "grad_norm": 1.617519930861052, "kl": 1.14453125, "learning_rate": 1.0005011526162988e-07, "loss": 0.0384, "reward": 1.1250000447034836, "reward_std": 0.5143625289201736, "rewards/accuracy_reward": 0.416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7083333432674408, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 656.6145935058594, "epoch": 0.9962666666666666, "grad_norm": 3.2591696541640336, "kl": 0.826171875, "learning_rate": 1.0001252898982477e-07, "loss": 0.0027, "reward": 1.2786458730697632, "reward_std": 0.364062175154686, "rewards/accuracy_reward": 0.5000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7786458432674408, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 764.1041870117188, "epoch": 0.9984, "grad_norm": 1.6261195919283198, "kl": 1.31689453125, "learning_rate": 1e-07, "loss": 0.0665, "reward": 1.0260417014360428, "reward_std": 0.4544539228081703, "rewards/accuracy_reward": 0.3333333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6927083432674408, "step": 468 }, { "epoch": 0.9984, "step": 468, "total_flos": 0.0, "train_loss": 0.08217587162211684, "train_runtime": 13418.4569, "train_samples_per_second": 0.559, "train_steps_per_second": 0.035 } ], "logging_steps": 1, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }