| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9984, | |
| "eval_steps": 500, | |
| "global_step": 468, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 894.0833435058594, | |
| "epoch": 0.0021333333333333334, | |
| "grad_norm": 0.17310986830806582, | |
| "kl": 0.0, | |
| "learning_rate": 2.127659574468085e-08, | |
| "loss": 0.0183, | |
| "reward": 0.7500000298023224, | |
| "reward_std": 0.3813292533159256, | |
| "rewards/accuracy_reward": 0.3125000037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4375000149011612, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 880.7187652587891, | |
| "epoch": 0.004266666666666667, | |
| "grad_norm": 0.16276198736608005, | |
| "kl": 0.0, | |
| "learning_rate": 4.25531914893617e-08, | |
| "loss": 0.0314, | |
| "reward": 0.7526041865348816, | |
| "reward_std": 0.3560462072491646, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.440104179084301, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 871.1354370117188, | |
| "epoch": 0.0064, | |
| "grad_norm": 0.1506266815932542, | |
| "kl": 2.1219253540039062e-05, | |
| "learning_rate": 6.382978723404254e-08, | |
| "loss": 0.0353, | |
| "reward": 0.6302083432674408, | |
| "reward_std": 0.3013310767710209, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4010416716337204, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 942.0937652587891, | |
| "epoch": 0.008533333333333334, | |
| "grad_norm": 0.16478813088653557, | |
| "kl": 1.8753111362457275e-05, | |
| "learning_rate": 8.51063829787234e-08, | |
| "loss": 0.0317, | |
| "reward": 0.6119791865348816, | |
| "reward_std": 0.3484013229608536, | |
| "rewards/accuracy_reward": 0.2187500111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3932291716337204, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 843.8541870117188, | |
| "epoch": 0.010666666666666666, | |
| "grad_norm": 0.20469530540041506, | |
| "kl": 2.9385089874267578e-05, | |
| "learning_rate": 1.0638297872340425e-07, | |
| "loss": 0.0553, | |
| "reward": 0.7187500223517418, | |
| "reward_std": 0.3990408657118678, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4270833432674408, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 878.7395935058594, | |
| "epoch": 0.0128, | |
| "grad_norm": 0.14039871090608422, | |
| "kl": 1.640617847442627e-05, | |
| "learning_rate": 1.2765957446808508e-07, | |
| "loss": 0.0399, | |
| "reward": 0.6562500149011612, | |
| "reward_std": 0.29901912435889244, | |
| "rewards/accuracy_reward": 0.22916667349636555, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4270833507180214, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 846.1250305175781, | |
| "epoch": 0.014933333333333333, | |
| "grad_norm": 0.19580219393645013, | |
| "kl": 2.7954578399658203e-05, | |
| "learning_rate": 1.4893617021276595e-07, | |
| "loss": 0.0596, | |
| "reward": 0.6927083432674408, | |
| "reward_std": 0.318842139095068, | |
| "rewards/accuracy_reward": 0.23958333861082792, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4531250074505806, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 868.0833435058594, | |
| "epoch": 0.017066666666666667, | |
| "grad_norm": 0.1646324395539006, | |
| "kl": 3.281235694885254e-05, | |
| "learning_rate": 1.702127659574468e-07, | |
| "loss": 0.0436, | |
| "reward": 0.5729166865348816, | |
| "reward_std": 0.2931280732154846, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3854166716337204, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 885.0729370117188, | |
| "epoch": 0.0192, | |
| "grad_norm": 0.17173680857220677, | |
| "kl": 2.326071262359619e-05, | |
| "learning_rate": 1.9148936170212765e-07, | |
| "loss": 0.0606, | |
| "reward": 0.6822916865348816, | |
| "reward_std": 0.38235192000865936, | |
| "rewards/accuracy_reward": 0.2604166716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4218750074505806, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 854.9166870117188, | |
| "epoch": 0.021333333333333333, | |
| "grad_norm": 0.17593762669139573, | |
| "kl": 2.586841583251953e-05, | |
| "learning_rate": 2.127659574468085e-07, | |
| "loss": 0.0259, | |
| "reward": 0.6536458432674408, | |
| "reward_std": 0.29265937581658363, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.424479179084301, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 892.1979370117188, | |
| "epoch": 0.023466666666666667, | |
| "grad_norm": 0.181843057081716, | |
| "kl": 4.273653030395508e-05, | |
| "learning_rate": 2.3404255319148937e-07, | |
| "loss": 0.0514, | |
| "reward": 0.6145833507180214, | |
| "reward_std": 0.26475396007299423, | |
| "rewards/accuracy_reward": 0.22916667256504297, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.385416679084301, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 837.8333587646484, | |
| "epoch": 0.0256, | |
| "grad_norm": 0.20275531176806552, | |
| "kl": 2.709031105041504e-05, | |
| "learning_rate": 2.5531914893617016e-07, | |
| "loss": 0.0699, | |
| "reward": 0.7473958432674408, | |
| "reward_std": 0.3608727604150772, | |
| "rewards/accuracy_reward": 0.2916666669771075, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.455729179084301, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 766.9271087646484, | |
| "epoch": 0.027733333333333332, | |
| "grad_norm": 0.1848812468936893, | |
| "kl": 2.6345252990722656e-05, | |
| "learning_rate": 2.7659574468085106e-07, | |
| "loss": 0.0275, | |
| "reward": 0.7916666865348816, | |
| "reward_std": 0.3341744616627693, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4583333358168602, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 894.1041870117188, | |
| "epoch": 0.029866666666666666, | |
| "grad_norm": 0.1657265791258112, | |
| "kl": 2.9712915420532227e-05, | |
| "learning_rate": 2.978723404255319e-07, | |
| "loss": 0.0513, | |
| "reward": 0.708333358168602, | |
| "reward_std": 0.3291201740503311, | |
| "rewards/accuracy_reward": 0.28125001303851604, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4270833432674408, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 935.7708587646484, | |
| "epoch": 0.032, | |
| "grad_norm": 0.1759764964363314, | |
| "kl": 3.552436828613281e-05, | |
| "learning_rate": 3.1914893617021275e-07, | |
| "loss": 0.0254, | |
| "reward": 0.5625000149011612, | |
| "reward_std": 0.3009165897965431, | |
| "rewards/accuracy_reward": 0.1979166753590107, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3645833432674408, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 889.5000152587891, | |
| "epoch": 0.034133333333333335, | |
| "grad_norm": 0.20036893134554598, | |
| "kl": 3.218650817871094e-05, | |
| "learning_rate": 3.404255319148936e-07, | |
| "loss": 0.0663, | |
| "reward": 0.6848958432674408, | |
| "reward_std": 0.357659300789237, | |
| "rewards/accuracy_reward": 0.2500000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4348958432674408, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 849.3958435058594, | |
| "epoch": 0.03626666666666667, | |
| "grad_norm": 0.1872151772588199, | |
| "kl": 2.2232532501220703e-05, | |
| "learning_rate": 3.617021276595745e-07, | |
| "loss": 0.0471, | |
| "reward": 0.7500000298023224, | |
| "reward_std": 0.30543046444654465, | |
| "rewards/accuracy_reward": 0.3437500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4062500074505806, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 856.1562652587891, | |
| "epoch": 0.0384, | |
| "grad_norm": 0.20516969145364147, | |
| "kl": 2.6524066925048828e-05, | |
| "learning_rate": 3.829787234042553e-07, | |
| "loss": 0.0658, | |
| "reward": 0.8046875149011612, | |
| "reward_std": 0.3947491720318794, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4296875149011612, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 930.3020935058594, | |
| "epoch": 0.04053333333333333, | |
| "grad_norm": 0.16862112578032326, | |
| "kl": 2.0489096641540527e-05, | |
| "learning_rate": 4.0425531914893614e-07, | |
| "loss": 0.0448, | |
| "reward": 0.5468750223517418, | |
| "reward_std": 0.24587241373956203, | |
| "rewards/accuracy_reward": 0.1770833395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.369791679084301, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 931.0000305175781, | |
| "epoch": 0.042666666666666665, | |
| "grad_norm": 0.14505532791974818, | |
| "kl": 2.5272369384765625e-05, | |
| "learning_rate": 4.25531914893617e-07, | |
| "loss": 0.0042, | |
| "reward": 0.4791666716337204, | |
| "reward_std": 0.20529046654701233, | |
| "rewards/accuracy_reward": 0.13541666883975267, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3437500074505806, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 894.1875305175781, | |
| "epoch": 0.0448, | |
| "grad_norm": 0.21590798838856656, | |
| "kl": 2.902001142501831e-05, | |
| "learning_rate": 4.4680851063829783e-07, | |
| "loss": 0.0544, | |
| "reward": 0.5390625149011612, | |
| "reward_std": 0.3045891672372818, | |
| "rewards/accuracy_reward": 0.16666667442768812, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3723958358168602, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 929.4687652587891, | |
| "epoch": 0.046933333333333334, | |
| "grad_norm": 0.1446020405000745, | |
| "kl": 2.872943878173828e-05, | |
| "learning_rate": 4.6808510638297873e-07, | |
| "loss": 0.0354, | |
| "reward": 0.5885416865348816, | |
| "reward_std": 0.24042147770524025, | |
| "rewards/accuracy_reward": 0.21875001024454832, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3697916716337204, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 951.7500152587891, | |
| "epoch": 0.04906666666666667, | |
| "grad_norm": 0.1645371600886353, | |
| "kl": 2.2001564502716064e-05, | |
| "learning_rate": 4.893617021276595e-07, | |
| "loss": 0.0339, | |
| "reward": 0.5182291865348816, | |
| "reward_std": 0.2648882642388344, | |
| "rewards/accuracy_reward": 0.14583333674818277, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3723958432674408, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 944.8750152587891, | |
| "epoch": 0.0512, | |
| "grad_norm": 0.16476425166098713, | |
| "kl": 1.712888479232788e-05, | |
| "learning_rate": 5.106382978723403e-07, | |
| "loss": 0.0178, | |
| "reward": 0.5338541865348816, | |
| "reward_std": 0.2086249254643917, | |
| "rewards/accuracy_reward": 0.1770833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3567708432674408, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 912.0521087646484, | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 0.15949749914052028, | |
| "kl": 1.029670238494873e-05, | |
| "learning_rate": 5.319148936170212e-07, | |
| "loss": 0.0359, | |
| "reward": 0.5807291865348816, | |
| "reward_std": 0.23926915973424911, | |
| "rewards/accuracy_reward": 0.19791667442768812, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3828125149011612, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 948.7500152587891, | |
| "epoch": 0.055466666666666664, | |
| "grad_norm": 0.11874893433162777, | |
| "kl": 2.4452805519104004e-05, | |
| "learning_rate": 5.531914893617021e-07, | |
| "loss": 0.0124, | |
| "reward": 0.510416679084301, | |
| "reward_std": 0.18505185097455978, | |
| "rewards/accuracy_reward": 0.1770833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3333333358168602, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 802.9687805175781, | |
| "epoch": 0.0576, | |
| "grad_norm": 0.22329831650935772, | |
| "kl": 2.3111701011657715e-05, | |
| "learning_rate": 5.74468085106383e-07, | |
| "loss": 0.0836, | |
| "reward": 0.809895858168602, | |
| "reward_std": 0.4666432961821556, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4765625074505806, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 930.2187652587891, | |
| "epoch": 0.05973333333333333, | |
| "grad_norm": 0.13098037690492897, | |
| "kl": 2.3573637008666992e-05, | |
| "learning_rate": 5.957446808510638e-07, | |
| "loss": 0.0258, | |
| "reward": 0.6250000149011612, | |
| "reward_std": 0.2079470045864582, | |
| "rewards/accuracy_reward": 0.2395833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.385416679084301, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 919.5625305175781, | |
| "epoch": 0.06186666666666667, | |
| "grad_norm": 0.15705368765533165, | |
| "kl": 3.425776958465576e-05, | |
| "learning_rate": 6.170212765957446e-07, | |
| "loss": 0.0226, | |
| "reward": 0.5833333432674408, | |
| "reward_std": 0.3197858855128288, | |
| "rewards/accuracy_reward": 0.1875000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3958333507180214, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 838.4271087646484, | |
| "epoch": 0.064, | |
| "grad_norm": 0.20855681863309491, | |
| "kl": 3.6150217056274414e-05, | |
| "learning_rate": 6.382978723404255e-07, | |
| "loss": 0.0557, | |
| "reward": 0.6380208507180214, | |
| "reward_std": 0.3100079074501991, | |
| "rewards/accuracy_reward": 0.21875000558793545, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4192708432674408, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 862.4271087646484, | |
| "epoch": 0.06613333333333334, | |
| "grad_norm": 0.19570403843889805, | |
| "kl": 6.431341171264648e-05, | |
| "learning_rate": 6.595744680851063e-07, | |
| "loss": 0.0654, | |
| "reward": 0.6666666865348816, | |
| "reward_std": 0.23266133293509483, | |
| "rewards/accuracy_reward": 0.23958334606140852, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4270833432674408, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 840.6041870117188, | |
| "epoch": 0.06826666666666667, | |
| "grad_norm": 0.1847871279034566, | |
| "kl": 6.045214831829071e-05, | |
| "learning_rate": 6.808510638297872e-07, | |
| "loss": 0.0659, | |
| "reward": 0.6015625223517418, | |
| "reward_std": 0.33919696137309074, | |
| "rewards/accuracy_reward": 0.19791667349636555, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4036458432674408, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 903.1250305175781, | |
| "epoch": 0.0704, | |
| "grad_norm": 0.1688618611852386, | |
| "kl": 5.7756900787353516e-05, | |
| "learning_rate": 7.021276595744681e-07, | |
| "loss": 0.0359, | |
| "reward": 0.6458333432674408, | |
| "reward_std": 0.3414671868085861, | |
| "rewards/accuracy_reward": 0.2395833395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4062500074505806, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 827.0729217529297, | |
| "epoch": 0.07253333333333334, | |
| "grad_norm": 0.18227015395901028, | |
| "kl": 0.00010900944471359253, | |
| "learning_rate": 7.23404255319149e-07, | |
| "loss": 0.0124, | |
| "reward": 0.8307292014360428, | |
| "reward_std": 0.3232859745621681, | |
| "rewards/accuracy_reward": 0.4062500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4244791716337204, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 879.5729370117188, | |
| "epoch": 0.07466666666666667, | |
| "grad_norm": 0.18233962099030362, | |
| "kl": 0.00015303492546081543, | |
| "learning_rate": 7.446808510638297e-07, | |
| "loss": 0.0579, | |
| "reward": 0.6328125298023224, | |
| "reward_std": 0.20242082886397839, | |
| "rewards/accuracy_reward": 0.20833333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.424479179084301, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 833.0000152587891, | |
| "epoch": 0.0768, | |
| "grad_norm": 0.21846869996465385, | |
| "kl": 0.00018644332885742188, | |
| "learning_rate": 7.659574468085106e-07, | |
| "loss": 0.058, | |
| "reward": 0.8697916865348816, | |
| "reward_std": 0.44036802649497986, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.463541679084301, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 884.8750305175781, | |
| "epoch": 0.07893333333333333, | |
| "grad_norm": 0.18403339762716192, | |
| "kl": 0.00013756752014160156, | |
| "learning_rate": 7.872340425531915e-07, | |
| "loss": 0.0163, | |
| "reward": 0.5911458656191826, | |
| "reward_std": 0.30511896684765816, | |
| "rewards/accuracy_reward": 0.20833334233611822, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3828125149011612, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 837.1562805175781, | |
| "epoch": 0.08106666666666666, | |
| "grad_norm": 0.19659938761910423, | |
| "kl": 0.00023549795150756836, | |
| "learning_rate": 8.085106382978723e-07, | |
| "loss": 0.0552, | |
| "reward": 0.7369792014360428, | |
| "reward_std": 0.3835315965116024, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.424479179084301, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 864.0416870117188, | |
| "epoch": 0.0832, | |
| "grad_norm": 0.17026366658290032, | |
| "kl": 0.0002574920654296875, | |
| "learning_rate": 8.297872340425532e-07, | |
| "loss": 0.042, | |
| "reward": 0.658854179084301, | |
| "reward_std": 0.22663411498069763, | |
| "rewards/accuracy_reward": 0.2604166669771075, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3984375074505806, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 862.2083435058594, | |
| "epoch": 0.08533333333333333, | |
| "grad_norm": 0.15525233934854948, | |
| "kl": 0.00037288665771484375, | |
| "learning_rate": 8.51063829787234e-07, | |
| "loss": 0.0344, | |
| "reward": 0.7239583432674408, | |
| "reward_std": 0.21023957571014762, | |
| "rewards/accuracy_reward": 0.30208333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4218750149011612, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 890.6354370117188, | |
| "epoch": 0.08746666666666666, | |
| "grad_norm": 0.15378582910420188, | |
| "kl": 0.00021708011627197266, | |
| "learning_rate": 8.723404255319149e-07, | |
| "loss": 0.0307, | |
| "reward": 0.6093750149011612, | |
| "reward_std": 0.20325927063822746, | |
| "rewards/accuracy_reward": 0.21875000558793545, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3906250149011612, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 721.0104217529297, | |
| "epoch": 0.0896, | |
| "grad_norm": 0.25866548106788456, | |
| "kl": 0.0007886886596679688, | |
| "learning_rate": 8.936170212765957e-07, | |
| "loss": 0.0465, | |
| "reward": 0.9531250298023224, | |
| "reward_std": 0.4038851633667946, | |
| "rewards/accuracy_reward": 0.447916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5052083507180214, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 864.9687652587891, | |
| "epoch": 0.09173333333333333, | |
| "grad_norm": 0.1593135664938312, | |
| "kl": 0.0002856254577636719, | |
| "learning_rate": 9.148936170212766e-07, | |
| "loss": 0.0397, | |
| "reward": 0.7630208432674408, | |
| "reward_std": 0.24727017246186733, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4296875149011612, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 798.4791870117188, | |
| "epoch": 0.09386666666666667, | |
| "grad_norm": 0.3075627551417041, | |
| "kl": 0.0007753372192382812, | |
| "learning_rate": 9.361702127659575e-07, | |
| "loss": 0.0926, | |
| "reward": 0.8359375149011612, | |
| "reward_std": 0.4430364593863487, | |
| "rewards/accuracy_reward": 0.33333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5026041865348816, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 857.8437652587891, | |
| "epoch": 0.096, | |
| "grad_norm": 0.16498643701657156, | |
| "kl": 0.0007944107055664062, | |
| "learning_rate": 9.574468085106384e-07, | |
| "loss": 0.0198, | |
| "reward": 0.6614583507180214, | |
| "reward_std": 0.2478529755026102, | |
| "rewards/accuracy_reward": 0.2708333423361182, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3906250074505806, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 841.1562652587891, | |
| "epoch": 0.09813333333333334, | |
| "grad_norm": 0.18056566439161278, | |
| "kl": 0.00116729736328125, | |
| "learning_rate": 9.78723404255319e-07, | |
| "loss": 0.0172, | |
| "reward": 0.5911458432674408, | |
| "reward_std": 0.24517233669757843, | |
| "rewards/accuracy_reward": 0.1770833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4140625149011612, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 886.1562652587891, | |
| "epoch": 0.10026666666666667, | |
| "grad_norm": 0.1970736501400628, | |
| "kl": 0.0014438629150390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0498, | |
| "reward": 0.7135416716337204, | |
| "reward_std": 0.26464908197522163, | |
| "rewards/accuracy_reward": 0.3333333497866988, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3802083507180214, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 890.9375152587891, | |
| "epoch": 0.1024, | |
| "grad_norm": 0.1944077299260527, | |
| "kl": 0.00160980224609375, | |
| "learning_rate": 9.999874710101751e-07, | |
| "loss": 0.0525, | |
| "reward": 0.692708358168602, | |
| "reward_std": 0.35645777732133865, | |
| "rewards/accuracy_reward": 0.291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.401041679084301, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 858.7604217529297, | |
| "epoch": 0.10453333333333334, | |
| "grad_norm": 0.20748565591002802, | |
| "kl": 0.0023660659790039062, | |
| "learning_rate": 9.999498847383701e-07, | |
| "loss": 0.0501, | |
| "reward": 0.8203125149011612, | |
| "reward_std": 0.38833674043416977, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4244791716337204, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 749.6041870117188, | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 0.225112305188479, | |
| "kl": 0.0024404525756835938, | |
| "learning_rate": 9.998872432775536e-07, | |
| "loss": 0.0557, | |
| "reward": 1.0703125447034836, | |
| "reward_std": 0.3621439263224602, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4661458358168602, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 845.7083740234375, | |
| "epoch": 0.1088, | |
| "grad_norm": 0.19286371886262563, | |
| "kl": 0.0018463134765625, | |
| "learning_rate": 9.99799550115878e-07, | |
| "loss": 0.0212, | |
| "reward": 0.7526041939854622, | |
| "reward_std": 0.3729289174079895, | |
| "rewards/accuracy_reward": 0.3541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3984375149011612, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 806.3854370117188, | |
| "epoch": 0.11093333333333333, | |
| "grad_norm": 0.23134579794812155, | |
| "kl": 0.0025386810302734375, | |
| "learning_rate": 9.99686810136484e-07, | |
| "loss": 0.0694, | |
| "reward": 0.817708358168602, | |
| "reward_std": 0.34440432488918304, | |
| "rewards/accuracy_reward": 0.3645833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4531250149011612, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 844.3437652587891, | |
| "epoch": 0.11306666666666666, | |
| "grad_norm": 0.17791438000124069, | |
| "kl": 0.00243377685546875, | |
| "learning_rate": 9.995490296172302e-07, | |
| "loss": 0.0348, | |
| "reward": 0.7968750298023224, | |
| "reward_std": 0.26974398642778397, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4010416865348816, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 910.5208435058594, | |
| "epoch": 0.1152, | |
| "grad_norm": 0.19770122402586285, | |
| "kl": 0.0022225379943847656, | |
| "learning_rate": 9.993862162303412e-07, | |
| "loss": 0.0511, | |
| "reward": 0.7473958432674408, | |
| "reward_std": 0.2851286958903074, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.393229179084301, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 890.2396087646484, | |
| "epoch": 0.11733333333333333, | |
| "grad_norm": 0.22500934978331336, | |
| "kl": 0.0024385452270507812, | |
| "learning_rate": 9.991983790419832e-07, | |
| "loss": 0.053, | |
| "reward": 0.653645858168602, | |
| "reward_std": 0.2874385491013527, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4036458432674408, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 872.7396087646484, | |
| "epoch": 0.11946666666666667, | |
| "grad_norm": 0.18421634404374504, | |
| "kl": 0.003376007080078125, | |
| "learning_rate": 9.989855285117573e-07, | |
| "loss": 0.0606, | |
| "reward": 0.7187500298023224, | |
| "reward_std": 0.271013580262661, | |
| "rewards/accuracy_reward": 0.3229166781529784, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3958333432674408, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 815.7083587646484, | |
| "epoch": 0.1216, | |
| "grad_norm": 0.22992129396312608, | |
| "kl": 0.003387451171875, | |
| "learning_rate": 9.98747676492117e-07, | |
| "loss": 0.0465, | |
| "reward": 0.8151042014360428, | |
| "reward_std": 0.38903436064720154, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.440104179084301, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 790.6875152587891, | |
| "epoch": 0.12373333333333333, | |
| "grad_norm": 0.18867270309064404, | |
| "kl": 0.00368499755859375, | |
| "learning_rate": 9.984848362277092e-07, | |
| "loss": 0.0351, | |
| "reward": 0.7994791865348816, | |
| "reward_std": 0.25493843853473663, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.424479179084301, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 867.0104370117188, | |
| "epoch": 0.12586666666666665, | |
| "grad_norm": 0.21208207281989394, | |
| "kl": 0.0047149658203125, | |
| "learning_rate": 9.981970223546364e-07, | |
| "loss": 0.0405, | |
| "reward": 0.8437500223517418, | |
| "reward_std": 0.4021975174546242, | |
| "rewards/accuracy_reward": 0.4270833544433117, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.416666679084301, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 772.7291870117188, | |
| "epoch": 0.128, | |
| "grad_norm": 0.2211569911681807, | |
| "kl": 0.003696441650390625, | |
| "learning_rate": 9.97884250899641e-07, | |
| "loss": 0.0368, | |
| "reward": 1.0312500298023224, | |
| "reward_std": 0.21990002878010273, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4895833507180214, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 899.7708587646484, | |
| "epoch": 0.13013333333333332, | |
| "grad_norm": 0.11014958756420937, | |
| "kl": 0.00301361083984375, | |
| "learning_rate": 9.975465392792135e-07, | |
| "loss": 0.0015, | |
| "reward": 0.6744791865348816, | |
| "reward_std": 0.16699286550283432, | |
| "rewards/accuracy_reward": 0.322916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3515625074505806, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 913.7291870117188, | |
| "epoch": 0.13226666666666667, | |
| "grad_norm": 0.18919373882455442, | |
| "kl": 0.003414154052734375, | |
| "learning_rate": 9.971839062986228e-07, | |
| "loss": 0.0458, | |
| "reward": 0.708333358168602, | |
| "reward_std": 0.37071289867162704, | |
| "rewards/accuracy_reward": 0.3020833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4062500149011612, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 850.1354522705078, | |
| "epoch": 0.1344, | |
| "grad_norm": 0.22143875732066168, | |
| "kl": 0.00384521484375, | |
| "learning_rate": 9.967963721508683e-07, | |
| "loss": 0.0392, | |
| "reward": 0.7526041865348816, | |
| "reward_std": 0.27115987055003643, | |
| "rewards/accuracy_reward": 0.3333333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4192708432674408, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 963.1979370117188, | |
| "epoch": 0.13653333333333334, | |
| "grad_norm": 0.1550262557067119, | |
| "kl": 0.0036773681640625, | |
| "learning_rate": 9.963839584155564e-07, | |
| "loss": 0.0294, | |
| "reward": 0.5000000149011612, | |
| "reward_std": 0.238736130297184, | |
| "rewards/accuracy_reward": 0.1458333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3541666716337204, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 723.5833435058594, | |
| "epoch": 0.13866666666666666, | |
| "grad_norm": 0.20889664359051643, | |
| "kl": 0.0074310302734375, | |
| "learning_rate": 9.95946688057698e-07, | |
| "loss": 0.0291, | |
| "reward": 0.9791666716337204, | |
| "reward_std": 0.2577291578054428, | |
| "rewards/accuracy_reward": 0.5312500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4479166716337204, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 785.0520935058594, | |
| "epoch": 0.1408, | |
| "grad_norm": 0.22320299690399084, | |
| "kl": 0.005214691162109375, | |
| "learning_rate": 9.954845854264304e-07, | |
| "loss": 0.0318, | |
| "reward": 0.8854167014360428, | |
| "reward_std": 0.38225920498371124, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4270833432674408, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 687.5521011352539, | |
| "epoch": 0.14293333333333333, | |
| "grad_norm": 0.26192399966053026, | |
| "kl": 0.00640869140625, | |
| "learning_rate": 9.949976762536612e-07, | |
| "loss": 0.0332, | |
| "reward": 1.0598958432674408, | |
| "reward_std": 0.38580355048179626, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4973958507180214, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 816.4583435058594, | |
| "epoch": 0.14506666666666668, | |
| "grad_norm": 0.2195932504857594, | |
| "kl": 0.006011962890625, | |
| "learning_rate": 9.944859876526347e-07, | |
| "loss": 0.0482, | |
| "reward": 0.8671875223517418, | |
| "reward_std": 0.42579207941889763, | |
| "rewards/accuracy_reward": 0.3958333386108279, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4713541865348816, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 763.9895935058594, | |
| "epoch": 0.1472, | |
| "grad_norm": 0.19845257741384642, | |
| "kl": 0.006496429443359375, | |
| "learning_rate": 9.939495481164237e-07, | |
| "loss": 0.0259, | |
| "reward": 1.1302083730697632, | |
| "reward_std": 0.3065594360232353, | |
| "rewards/accuracy_reward": 0.6250000298023224, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5052083432674408, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 714.0520935058594, | |
| "epoch": 0.14933333333333335, | |
| "grad_norm": 0.22169840225751747, | |
| "kl": 0.005207061767578125, | |
| "learning_rate": 9.933883875163411e-07, | |
| "loss": 0.055, | |
| "reward": 1.057291716337204, | |
| "reward_std": 0.2972635589540005, | |
| "rewards/accuracy_reward": 0.5625000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4947916865348816, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 880.4791870117188, | |
| "epoch": 0.15146666666666667, | |
| "grad_norm": 0.23367859778525008, | |
| "kl": 0.00757598876953125, | |
| "learning_rate": 9.928025371002782e-07, | |
| "loss": 0.067, | |
| "reward": 0.7057291865348816, | |
| "reward_std": 0.34901827573776245, | |
| "rewards/accuracy_reward": 0.28125000558793545, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4244791716337204, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 873.2604370117188, | |
| "epoch": 0.1536, | |
| "grad_norm": 0.18132372403520364, | |
| "kl": 0.00620269775390625, | |
| "learning_rate": 9.921920294909627e-07, | |
| "loss": 0.0377, | |
| "reward": 0.7057291939854622, | |
| "reward_std": 0.35817644000053406, | |
| "rewards/accuracy_reward": 0.2916666753590107, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4140625149011612, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 738.1979370117188, | |
| "epoch": 0.15573333333333333, | |
| "grad_norm": 0.22228134275480527, | |
| "kl": 0.00677490234375, | |
| "learning_rate": 9.91556898684145e-07, | |
| "loss": 0.0601, | |
| "reward": 1.1302083879709244, | |
| "reward_std": 0.30970917269587517, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.505208358168602, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 821.6145935058594, | |
| "epoch": 0.15786666666666666, | |
| "grad_norm": 0.20389266404950881, | |
| "kl": 0.00748443603515625, | |
| "learning_rate": 9.90897180046702e-07, | |
| "loss": 0.0303, | |
| "reward": 0.934895858168602, | |
| "reward_std": 0.29574301838874817, | |
| "rewards/accuracy_reward": 0.4687500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.466145858168602, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 802.5000305175781, | |
| "epoch": 0.16, | |
| "grad_norm": 0.2024163287984723, | |
| "kl": 0.00734710693359375, | |
| "learning_rate": 9.902129103146697e-07, | |
| "loss": 0.0486, | |
| "reward": 0.8515625149011612, | |
| "reward_std": 0.2905624881386757, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4348958432674408, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 846.8646087646484, | |
| "epoch": 0.16213333333333332, | |
| "grad_norm": 0.2311059289379649, | |
| "kl": 0.00652313232421875, | |
| "learning_rate": 9.89504127591197e-07, | |
| "loss": 0.0571, | |
| "reward": 0.7942708507180214, | |
| "reward_std": 0.30027635395526886, | |
| "rewards/accuracy_reward": 0.3854166828095913, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4088541716337204, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 878.6458587646484, | |
| "epoch": 0.16426666666666667, | |
| "grad_norm": 0.15628470133135228, | |
| "kl": 0.00507354736328125, | |
| "learning_rate": 9.887708713444242e-07, | |
| "loss": 0.0275, | |
| "reward": 0.799479179084301, | |
| "reward_std": 0.356827512383461, | |
| "rewards/accuracy_reward": 0.3958333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4036458432674408, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 754.3541870117188, | |
| "epoch": 0.1664, | |
| "grad_norm": 0.23480771387358682, | |
| "kl": 0.00949859619140625, | |
| "learning_rate": 9.880131824052848e-07, | |
| "loss": 0.0503, | |
| "reward": 1.0078125447034836, | |
| "reward_std": 0.3376114219427109, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4661458507180214, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 807.8646087646484, | |
| "epoch": 0.16853333333333334, | |
| "grad_norm": 0.2549074989608758, | |
| "kl": 0.0102691650390625, | |
| "learning_rate": 9.87231102965232e-07, | |
| "loss": 0.0761, | |
| "reward": 0.755208358168602, | |
| "reward_std": 0.3330737203359604, | |
| "rewards/accuracy_reward": 0.3020833395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4531250074505806, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 842.8229370117188, | |
| "epoch": 0.17066666666666666, | |
| "grad_norm": 0.16239962220325158, | |
| "kl": 0.00743865966796875, | |
| "learning_rate": 9.8642467657389e-07, | |
| "loss": 0.0406, | |
| "reward": 0.7812500074505806, | |
| "reward_std": 0.24844172969460487, | |
| "rewards/accuracy_reward": 0.35416666977107525, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4270833432674408, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 831.6562652587891, | |
| "epoch": 0.1728, | |
| "grad_norm": 0.19487106047627312, | |
| "kl": 0.00650787353515625, | |
| "learning_rate": 9.855939481366275e-07, | |
| "loss": 0.0389, | |
| "reward": 0.7890625149011612, | |
| "reward_std": 0.27398327738046646, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4348958358168602, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 764.9062652587891, | |
| "epoch": 0.17493333333333333, | |
| "grad_norm": 0.24325209901582961, | |
| "kl": 0.008331298828125, | |
| "learning_rate": 9.847389639120585e-07, | |
| "loss": 0.0106, | |
| "reward": 1.0130208432674408, | |
| "reward_std": 0.4021530821919441, | |
| "rewards/accuracy_reward": 0.4895833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5234375149011612, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 819.7604522705078, | |
| "epoch": 0.17706666666666668, | |
| "grad_norm": 0.2209085097836123, | |
| "kl": 0.0158843994140625, | |
| "learning_rate": 9.83859771509466e-07, | |
| "loss": 0.0379, | |
| "reward": 0.7812500149011612, | |
| "reward_std": 0.33020088635385036, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4270833507180214, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 892.4687805175781, | |
| "epoch": 0.1792, | |
| "grad_norm": 0.1908083102962245, | |
| "kl": 0.00670623779296875, | |
| "learning_rate": 9.829564198861508e-07, | |
| "loss": 0.0521, | |
| "reward": 0.7994792014360428, | |
| "reward_std": 0.3195120617747307, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.424479179084301, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 867.65625, | |
| "epoch": 0.18133333333333335, | |
| "grad_norm": 0.19210591117215867, | |
| "kl": 0.00887298583984375, | |
| "learning_rate": 9.820289593447051e-07, | |
| "loss": 0.0358, | |
| "reward": 0.8593750149011612, | |
| "reward_std": 0.313580647110939, | |
| "rewards/accuracy_reward": 0.4062500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4531250149011612, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 825.9375152587891, | |
| "epoch": 0.18346666666666667, | |
| "grad_norm": 0.19047008974679128, | |
| "kl": 0.008056640625, | |
| "learning_rate": 9.810774415302124e-07, | |
| "loss": 0.0366, | |
| "reward": 0.7890625298023224, | |
| "reward_std": 0.30688632279634476, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4348958432674408, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 790.0312652587891, | |
| "epoch": 0.1856, | |
| "grad_norm": 0.2076366117901631, | |
| "kl": 0.0073089599609375, | |
| "learning_rate": 9.8010191942737e-07, | |
| "loss": 0.0695, | |
| "reward": 0.9557292014360428, | |
| "reward_std": 0.3128313571214676, | |
| "rewards/accuracy_reward": 0.479166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4765625074505806, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 839.0416717529297, | |
| "epoch": 0.18773333333333334, | |
| "grad_norm": 0.18856368038348906, | |
| "kl": 0.00917816162109375, | |
| "learning_rate": 9.791024473575404e-07, | |
| "loss": 0.058, | |
| "reward": 0.7760417014360428, | |
| "reward_std": 0.3489021761342883, | |
| "rewards/accuracy_reward": 0.3437500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.432291679084301, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 792.1562652587891, | |
| "epoch": 0.18986666666666666, | |
| "grad_norm": 0.2399608562140009, | |
| "kl": 0.00940704345703125, | |
| "learning_rate": 9.780790809757253e-07, | |
| "loss": 0.078, | |
| "reward": 1.046875, | |
| "reward_std": 0.3642818257212639, | |
| "rewards/accuracy_reward": 0.5520833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.494791679084301, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 688.6875152587891, | |
| "epoch": 0.192, | |
| "grad_norm": 0.2577576037099253, | |
| "kl": 0.0142364501953125, | |
| "learning_rate": 9.770318772674668e-07, | |
| "loss": 0.0516, | |
| "reward": 1.1718750447034836, | |
| "reward_std": 0.2594817951321602, | |
| "rewards/accuracy_reward": 0.6458333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.526041679084301, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 844.0416870117188, | |
| "epoch": 0.19413333333333332, | |
| "grad_norm": 0.2610335490987318, | |
| "kl": 0.0152130126953125, | |
| "learning_rate": 9.759608945456744e-07, | |
| "loss": 0.0783, | |
| "reward": 0.8567708432674408, | |
| "reward_std": 0.3492202013731003, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4609375223517418, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 721.5416870117188, | |
| "epoch": 0.19626666666666667, | |
| "grad_norm": 0.30445853959364694, | |
| "kl": 0.0128326416015625, | |
| "learning_rate": 9.748661924473775e-07, | |
| "loss": 0.0666, | |
| "reward": 1.0078125447034836, | |
| "reward_std": 0.365444540977478, | |
| "rewards/accuracy_reward": 0.4687500223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5390625149011612, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 760.5521087646484, | |
| "epoch": 0.1984, | |
| "grad_norm": 0.26398567569252307, | |
| "kl": 0.0128021240234375, | |
| "learning_rate": 9.737478319304048e-07, | |
| "loss": 0.0602, | |
| "reward": 1.0963541865348816, | |
| "reward_std": 0.28444724529981613, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.533854179084301, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 745.0729217529297, | |
| "epoch": 0.20053333333333334, | |
| "grad_norm": 0.31783151457154274, | |
| "kl": 0.0146636962890625, | |
| "learning_rate": 9.726058752699897e-07, | |
| "loss": 0.0404, | |
| "reward": 1.0390625298023224, | |
| "reward_std": 0.3004063665866852, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5390625298023224, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 782.3646087646484, | |
| "epoch": 0.20266666666666666, | |
| "grad_norm": 0.262805669568343, | |
| "kl": 0.0127410888671875, | |
| "learning_rate": 9.714403860553027e-07, | |
| "loss": 0.0752, | |
| "reward": 0.9635417014360428, | |
| "reward_std": 0.34173475950956345, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5260416865348816, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 635.7708587646484, | |
| "epoch": 0.2048, | |
| "grad_norm": 0.2986462792289919, | |
| "kl": 0.0128631591796875, | |
| "learning_rate": 9.702514291859108e-07, | |
| "loss": 0.0892, | |
| "reward": 1.3906250596046448, | |
| "reward_std": 0.3887404687702656, | |
| "rewards/accuracy_reward": 0.7187500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6718750149011612, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 742.9791870117188, | |
| "epoch": 0.20693333333333333, | |
| "grad_norm": 0.23266958044606426, | |
| "kl": 0.01507568359375, | |
| "learning_rate": 9.690390708681624e-07, | |
| "loss": 0.0609, | |
| "reward": 1.1848958432674408, | |
| "reward_std": 0.3821899965405464, | |
| "rewards/accuracy_reward": 0.6458333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5390625149011612, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 807.4375152587891, | |
| "epoch": 0.20906666666666668, | |
| "grad_norm": 0.28427091318075753, | |
| "kl": 0.0138397216796875, | |
| "learning_rate": 9.678033786115028e-07, | |
| "loss": 0.0799, | |
| "reward": 0.9296875298023224, | |
| "reward_std": 0.39811836928129196, | |
| "rewards/accuracy_reward": 0.4270833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.502604179084301, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 690.6250152587891, | |
| "epoch": 0.2112, | |
| "grad_norm": 0.33385067368025745, | |
| "kl": 0.0172576904296875, | |
| "learning_rate": 9.665444212247126e-07, | |
| "loss": 0.1125, | |
| "reward": 1.174479216337204, | |
| "reward_std": 0.4053206965327263, | |
| "rewards/accuracy_reward": 0.604166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5703125149011612, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 703.5104370117188, | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 0.26763878695230725, | |
| "kl": 0.0191497802734375, | |
| "learning_rate": 9.652622688120774e-07, | |
| "loss": 0.0934, | |
| "reward": 1.2343750596046448, | |
| "reward_std": 0.4297167584300041, | |
| "rewards/accuracy_reward": 0.5937500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6406250149011612, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 729.8542022705078, | |
| "epoch": 0.21546666666666667, | |
| "grad_norm": 0.2604446935793223, | |
| "kl": 0.01666259765625, | |
| "learning_rate": 9.639569927694842e-07, | |
| "loss": 0.0484, | |
| "reward": 1.244791716337204, | |
| "reward_std": 0.2556636780500412, | |
| "rewards/accuracy_reward": 0.6458333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5989583507180214, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 715.8229217529297, | |
| "epoch": 0.2176, | |
| "grad_norm": 0.29543708158686033, | |
| "kl": 0.020050048828125, | |
| "learning_rate": 9.626286657804454e-07, | |
| "loss": 0.0634, | |
| "reward": 1.127604216337204, | |
| "reward_std": 0.31952518224716187, | |
| "rewards/accuracy_reward": 0.5312500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5963541865348816, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 719.2500305175781, | |
| "epoch": 0.21973333333333334, | |
| "grad_norm": 0.2824191072667187, | |
| "kl": 0.020294189453125, | |
| "learning_rate": 9.612773618120509e-07, | |
| "loss": 0.0619, | |
| "reward": 1.1067708432674408, | |
| "reward_std": 0.4568670988082886, | |
| "rewards/accuracy_reward": 0.5312500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.575520858168602, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 800.2395935058594, | |
| "epoch": 0.22186666666666666, | |
| "grad_norm": 0.29815196332100213, | |
| "kl": 0.015228271484375, | |
| "learning_rate": 9.599031561108505e-07, | |
| "loss": 0.0637, | |
| "reward": 0.8593750223517418, | |
| "reward_std": 0.3243005946278572, | |
| "rewards/accuracy_reward": 0.3958333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.463541679084301, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 683.0625305175781, | |
| "epoch": 0.224, | |
| "grad_norm": 0.29748629948846467, | |
| "kl": 0.01812744140625, | |
| "learning_rate": 9.585061251986632e-07, | |
| "loss": 0.062, | |
| "reward": 1.263020858168602, | |
| "reward_std": 0.35825271904468536, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6588541865348816, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 728.7708587646484, | |
| "epoch": 0.22613333333333333, | |
| "grad_norm": 0.3282857287460221, | |
| "kl": 0.020751953125, | |
| "learning_rate": 9.57086346868316e-07, | |
| "loss": 0.1268, | |
| "reward": 1.1276041865348816, | |
| "reward_std": 0.4390442371368408, | |
| "rewards/accuracy_reward": 0.5520833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5755208432674408, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 777.3021087646484, | |
| "epoch": 0.22826666666666667, | |
| "grad_norm": 0.31736765287866603, | |
| "kl": 0.021881103515625, | |
| "learning_rate": 9.556439001793124e-07, | |
| "loss": 0.0837, | |
| "reward": 0.9401041865348816, | |
| "reward_std": 0.4035160690546036, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5338541939854622, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 826.1354370117188, | |
| "epoch": 0.2304, | |
| "grad_norm": 0.30615838328853484, | |
| "kl": 0.022369384765625, | |
| "learning_rate": 9.541788654534294e-07, | |
| "loss": 0.0902, | |
| "reward": 0.9921875447034836, | |
| "reward_std": 0.44786881655454636, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5546875149011612, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 706.3541870117188, | |
| "epoch": 0.23253333333333334, | |
| "grad_norm": 0.3044969299318315, | |
| "kl": 0.0216064453125, | |
| "learning_rate": 9.526913242702458e-07, | |
| "loss": 0.0482, | |
| "reward": 1.0520833432674408, | |
| "reward_std": 0.3498193696141243, | |
| "rewards/accuracy_reward": 0.42708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6250000149011612, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 789.9270935058594, | |
| "epoch": 0.23466666666666666, | |
| "grad_norm": 0.31077022412663985, | |
| "kl": 0.0228118896484375, | |
| "learning_rate": 9.511813594625986e-07, | |
| "loss": 0.0936, | |
| "reward": 1.0364583730697632, | |
| "reward_std": 0.45415426790714264, | |
| "rewards/accuracy_reward": 0.4895833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5468750149011612, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 782.9270935058594, | |
| "epoch": 0.2368, | |
| "grad_norm": 0.33311202322553307, | |
| "kl": 0.03173828125, | |
| "learning_rate": 9.496490551119708e-07, | |
| "loss": 0.0456, | |
| "reward": 1.0807292014360428, | |
| "reward_std": 0.43023916706442833, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5598958358168602, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 797.3958587646484, | |
| "epoch": 0.23893333333333333, | |
| "grad_norm": 0.27289293925345737, | |
| "kl": 0.0220947265625, | |
| "learning_rate": 9.480944965438097e-07, | |
| "loss": 0.0822, | |
| "reward": 0.8541666865348816, | |
| "reward_std": 0.2803124524652958, | |
| "rewards/accuracy_reward": 0.3541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5000000223517418, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 728.5416870117188, | |
| "epoch": 0.24106666666666668, | |
| "grad_norm": 0.3606704422913038, | |
| "kl": 0.029632568359375, | |
| "learning_rate": 9.465177703227755e-07, | |
| "loss": 0.0869, | |
| "reward": 1.0677083730697632, | |
| "reward_std": 0.37738659232854843, | |
| "rewards/accuracy_reward": 0.510416679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5572916865348816, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 652.7812805175781, | |
| "epoch": 0.2432, | |
| "grad_norm": 0.3584790769205954, | |
| "kl": 0.025115966796875, | |
| "learning_rate": 9.449189642479202e-07, | |
| "loss": 0.1074, | |
| "reward": 1.213541716337204, | |
| "reward_std": 0.4059242531657219, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5885416865348816, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 725.1146087646484, | |
| "epoch": 0.24533333333333332, | |
| "grad_norm": 0.4024464636552969, | |
| "kl": 0.023956298828125, | |
| "learning_rate": 9.432981673477996e-07, | |
| "loss": 0.1087, | |
| "reward": 0.911458358168602, | |
| "reward_std": 0.3445095419883728, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5781250149011612, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 698.2292022705078, | |
| "epoch": 0.24746666666666667, | |
| "grad_norm": 0.2829082400468053, | |
| "kl": 0.023193359375, | |
| "learning_rate": 9.416554698755153e-07, | |
| "loss": 0.0251, | |
| "reward": 1.151041716337204, | |
| "reward_std": 0.3484726771712303, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6510416716337204, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 695.8958587646484, | |
| "epoch": 0.2496, | |
| "grad_norm": 0.4657128990625891, | |
| "kl": 0.03369140625, | |
| "learning_rate": 9.399909633036895e-07, | |
| "loss": 0.1124, | |
| "reward": 1.1406250298023224, | |
| "reward_std": 0.39415134489536285, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5781250223517418, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 601.6979370117188, | |
| "epoch": 0.2517333333333333, | |
| "grad_norm": 0.4438313112348482, | |
| "kl": 0.025787353515625, | |
| "learning_rate": 9.383047403193702e-07, | |
| "loss": 0.0863, | |
| "reward": 1.3020833730697632, | |
| "reward_std": 0.2631511315703392, | |
| "rewards/accuracy_reward": 0.635416679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6666666865348816, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 712.4062652587891, | |
| "epoch": 0.2538666666666667, | |
| "grad_norm": 0.3343335198346674, | |
| "kl": 0.031707763671875, | |
| "learning_rate": 9.365968948188716e-07, | |
| "loss": 0.0373, | |
| "reward": 1.1718750447034836, | |
| "reward_std": 0.3704974502325058, | |
| "rewards/accuracy_reward": 0.520833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6510416865348816, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 807.2187805175781, | |
| "epoch": 0.256, | |
| "grad_norm": 0.44264007327871097, | |
| "kl": 0.03948974609375, | |
| "learning_rate": 9.348675219025442e-07, | |
| "loss": 0.065, | |
| "reward": 0.8593750149011612, | |
| "reward_std": 0.3359655812382698, | |
| "rewards/accuracy_reward": 0.34375002048909664, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5156250149011612, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 585.8541793823242, | |
| "epoch": 0.2581333333333333, | |
| "grad_norm": 0.44710975229663324, | |
| "kl": 0.041748046875, | |
| "learning_rate": 9.331167178694797e-07, | |
| "loss": 0.0672, | |
| "reward": 1.3854166865348816, | |
| "reward_std": 0.3670189455151558, | |
| "rewards/accuracy_reward": 0.645833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.739583358168602, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 586.9270935058594, | |
| "epoch": 0.26026666666666665, | |
| "grad_norm": 0.5132333892556646, | |
| "kl": 0.03607177734375, | |
| "learning_rate": 9.313445802121493e-07, | |
| "loss": 0.0729, | |
| "reward": 1.2630209028720856, | |
| "reward_std": 0.3291833885014057, | |
| "rewards/accuracy_reward": 0.5937500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6692708432674408, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 639.3125152587891, | |
| "epoch": 0.2624, | |
| "grad_norm": 0.5632492443313848, | |
| "kl": 0.0504150390625, | |
| "learning_rate": 9.295512076109733e-07, | |
| "loss": 0.1126, | |
| "reward": 1.2135416865348816, | |
| "reward_std": 0.40659692883491516, | |
| "rewards/accuracy_reward": 0.5625000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6510416716337204, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 708.8229370117188, | |
| "epoch": 0.26453333333333334, | |
| "grad_norm": 0.7405631059644997, | |
| "kl": 0.051025390625, | |
| "learning_rate": 9.277366999288277e-07, | |
| "loss": 0.187, | |
| "reward": 1.0755208879709244, | |
| "reward_std": 0.4825942814350128, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6588541865348816, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 683.0520935058594, | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 0.9799980392473033, | |
| "kl": 0.05389404296875, | |
| "learning_rate": 9.259011582054829e-07, | |
| "loss": 0.1302, | |
| "reward": 0.942708358168602, | |
| "reward_std": 0.4361359179019928, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6093750149011612, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 700.6875152587891, | |
| "epoch": 0.2688, | |
| "grad_norm": 0.6649953700875584, | |
| "kl": 0.0589599609375, | |
| "learning_rate": 9.240446846519767e-07, | |
| "loss": 0.0771, | |
| "reward": 1.2239583730697632, | |
| "reward_std": 0.3370389975607395, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822916865348816, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 631.2708587646484, | |
| "epoch": 0.27093333333333336, | |
| "grad_norm": 0.7556931317033608, | |
| "kl": 0.05859375, | |
| "learning_rate": 9.221673826449239e-07, | |
| "loss": 0.1097, | |
| "reward": 1.3515625298023224, | |
| "reward_std": 0.3902127370238304, | |
| "rewards/accuracy_reward": 0.6875000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6640625, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 710.3333587646484, | |
| "epoch": 0.2730666666666667, | |
| "grad_norm": 0.6862132500429674, | |
| "kl": 0.07720947265625, | |
| "learning_rate": 9.202693567207587e-07, | |
| "loss": 0.0315, | |
| "reward": 1.1562500298023224, | |
| "reward_std": 0.4864039123058319, | |
| "rewards/accuracy_reward": 0.5104166939854622, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6458333432674408, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 697.7604370117188, | |
| "epoch": 0.2752, | |
| "grad_norm": 1.1345184536394581, | |
| "kl": 0.0782470703125, | |
| "learning_rate": 9.183507125699143e-07, | |
| "loss": 0.0796, | |
| "reward": 1.1901041865348816, | |
| "reward_std": 0.46896427869796753, | |
| "rewards/accuracy_reward": 0.5416666939854622, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6484375149011612, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 780.9896087646484, | |
| "epoch": 0.2773333333333333, | |
| "grad_norm": 1.2758278535685441, | |
| "kl": 0.10870361328125, | |
| "learning_rate": 9.164115570309379e-07, | |
| "loss": 0.1151, | |
| "reward": 1.0130208432674408, | |
| "reward_std": 0.46661972999572754, | |
| "rewards/accuracy_reward": 0.4479166818782687, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.565104179084301, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 687.6771087646484, | |
| "epoch": 0.27946666666666664, | |
| "grad_norm": 1.0505806863206573, | |
| "kl": 0.11328125, | |
| "learning_rate": 9.144519980845404e-07, | |
| "loss": 0.0867, | |
| "reward": 1.1718750298023224, | |
| "reward_std": 0.37612347677350044, | |
| "rewards/accuracy_reward": 0.520833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6510416865348816, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 797.9479370117188, | |
| "epoch": 0.2816, | |
| "grad_norm": 1.2170240378062362, | |
| "kl": 0.14501953125, | |
| "learning_rate": 9.124721448475846e-07, | |
| "loss": 0.0487, | |
| "reward": 0.8776041865348816, | |
| "reward_std": 0.3442392908036709, | |
| "rewards/accuracy_reward": 0.3333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.544270858168602, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 749.1146087646484, | |
| "epoch": 0.28373333333333334, | |
| "grad_norm": 1.7369718821049942, | |
| "kl": 0.1512451171875, | |
| "learning_rate": 9.104721075670086e-07, | |
| "loss": 0.0821, | |
| "reward": 1.0468750298023224, | |
| "reward_std": 0.43739401549100876, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6093750298023224, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 691.3437805175781, | |
| "epoch": 0.28586666666666666, | |
| "grad_norm": 1.2882430691135234, | |
| "kl": 0.1466064453125, | |
| "learning_rate": 9.084519976136866e-07, | |
| "loss": 0.0787, | |
| "reward": 1.1380208730697632, | |
| "reward_std": 0.37952160835266113, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6380208432674408, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 734.0833435058594, | |
| "epoch": 0.288, | |
| "grad_norm": 0.851150504615743, | |
| "kl": 0.19677734375, | |
| "learning_rate": 9.064119274762277e-07, | |
| "loss": 0.0263, | |
| "reward": 1.1067708730697632, | |
| "reward_std": 0.40093619376420975, | |
| "rewards/accuracy_reward": 0.5000000111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6067708432674408, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 571.4166793823242, | |
| "epoch": 0.29013333333333335, | |
| "grad_norm": 1.5391538331108987, | |
| "kl": 0.22900390625, | |
| "learning_rate": 9.043520107547121e-07, | |
| "loss": 0.0605, | |
| "reward": 1.3593750298023224, | |
| "reward_std": 0.4824457913637161, | |
| "rewards/accuracy_reward": 0.6770833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822916865348816, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 760.6146087646484, | |
| "epoch": 0.2922666666666667, | |
| "grad_norm": 1.8702095195613229, | |
| "kl": 0.34130859375, | |
| "learning_rate": 9.022723621543649e-07, | |
| "loss": 0.0721, | |
| "reward": 1.2109375298023224, | |
| "reward_std": 0.5021077692508698, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6484375149011612, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.3646087646484, | |
| "epoch": 0.2944, | |
| "grad_norm": 2.0067460857326798, | |
| "kl": 0.27685546875, | |
| "learning_rate": 9.001730974791688e-07, | |
| "loss": 0.0303, | |
| "reward": 1.307291716337204, | |
| "reward_std": 0.37274327129125595, | |
| "rewards/accuracy_reward": 0.5937500298023224, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7135416865348816, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 742.8854522705078, | |
| "epoch": 0.2965333333333333, | |
| "grad_norm": 2.727496225681536, | |
| "kl": 0.50390625, | |
| "learning_rate": 8.980543336254161e-07, | |
| "loss": 0.0822, | |
| "reward": 1.0390625298023224, | |
| "reward_std": 0.39713528752326965, | |
| "rewards/accuracy_reward": 0.4166666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6223958432674408, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 694.0416870117188, | |
| "epoch": 0.2986666666666667, | |
| "grad_norm": 1.6554617304448012, | |
| "kl": 0.64404296875, | |
| "learning_rate": 8.95916188575199e-07, | |
| "loss": 0.1016, | |
| "reward": 1.0364583432674408, | |
| "reward_std": 0.39632973819971085, | |
| "rewards/accuracy_reward": 0.447916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5885417014360428, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 703.0520935058594, | |
| "epoch": 0.3008, | |
| "grad_norm": 2.3471939112303204, | |
| "kl": 0.3349609375, | |
| "learning_rate": 8.937587813898401e-07, | |
| "loss": 0.0249, | |
| "reward": 1.1250000447034836, | |
| "reward_std": 0.4193038195371628, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6041666865348816, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 614.1458435058594, | |
| "epoch": 0.30293333333333333, | |
| "grad_norm": 3.554057508710156, | |
| "kl": 0.64013671875, | |
| "learning_rate": 8.915822322032628e-07, | |
| "loss": 0.0336, | |
| "reward": 1.2005208879709244, | |
| "reward_std": 0.4515683874487877, | |
| "rewards/accuracy_reward": 0.5312500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.669270858168602, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.9166870117188, | |
| "epoch": 0.30506666666666665, | |
| "grad_norm": 2.348762447932129, | |
| "kl": 0.55615234375, | |
| "learning_rate": 8.893866622153005e-07, | |
| "loss": 0.0428, | |
| "reward": 1.1328125298023224, | |
| "reward_std": 0.4083319380879402, | |
| "rewards/accuracy_reward": 0.479166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.653645858168602, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 702.0521087646484, | |
| "epoch": 0.3072, | |
| "grad_norm": 5.644818619662011, | |
| "kl": 0.445556640625, | |
| "learning_rate": 8.871721936849489e-07, | |
| "loss": 0.0297, | |
| "reward": 1.0390625447034836, | |
| "reward_std": 0.4139084219932556, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6640625149011612, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 586.5416793823242, | |
| "epoch": 0.30933333333333335, | |
| "grad_norm": 8.550257693587795, | |
| "kl": 0.481689453125, | |
| "learning_rate": 8.849389499235579e-07, | |
| "loss": 0.0437, | |
| "reward": 1.3359375298023224, | |
| "reward_std": 0.4612661302089691, | |
| "rewards/accuracy_reward": 0.6354166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7005208730697632, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.4270935058594, | |
| "epoch": 0.31146666666666667, | |
| "grad_norm": 3.0532728962697826, | |
| "kl": 0.61328125, | |
| "learning_rate": 8.826870552879645e-07, | |
| "loss": -0.0264, | |
| "reward": 1.1015625447034836, | |
| "reward_std": 0.5034219920635223, | |
| "rewards/accuracy_reward": 0.4479166716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6536458432674408, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 655.8437652587891, | |
| "epoch": 0.3136, | |
| "grad_norm": 3.124971384714811, | |
| "kl": 0.408935546875, | |
| "learning_rate": 8.804166351735689e-07, | |
| "loss": -0.0965, | |
| "reward": 1.0911458879709244, | |
| "reward_std": 0.4024947062134743, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6328125149011612, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 598.2812728881836, | |
| "epoch": 0.3157333333333333, | |
| "grad_norm": 1.6524780740950624, | |
| "kl": 0.401611328125, | |
| "learning_rate": 8.781278160073508e-07, | |
| "loss": 0.0663, | |
| "reward": 1.0781250596046448, | |
| "reward_std": 0.3967272564768791, | |
| "rewards/accuracy_reward": 0.4062500111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6718750298023224, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 531.8958435058594, | |
| "epoch": 0.3178666666666667, | |
| "grad_norm": 2.309440251724059, | |
| "kl": 0.71435546875, | |
| "learning_rate": 8.758207252408305e-07, | |
| "loss": -0.0223, | |
| "reward": 1.2942708730697632, | |
| "reward_std": 0.46261265128850937, | |
| "rewards/accuracy_reward": 0.5625000223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7317708432674408, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 553.5312652587891, | |
| "epoch": 0.32, | |
| "grad_norm": 2.4367930100465056, | |
| "kl": 0.688232421875, | |
| "learning_rate": 8.734954913429713e-07, | |
| "loss": 0.1082, | |
| "reward": 1.2500000298023224, | |
| "reward_std": 0.41490884870290756, | |
| "rewards/accuracy_reward": 0.5937500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.65625, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 648.5104217529297, | |
| "epoch": 0.3221333333333333, | |
| "grad_norm": 2.8900176756676266, | |
| "kl": 0.9423828125, | |
| "learning_rate": 8.71152243793026e-07, | |
| "loss": 0.0808, | |
| "reward": 1.1380208730697632, | |
| "reward_std": 0.48239949345588684, | |
| "rewards/accuracy_reward": 0.4270833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7109375298023224, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 559.2396011352539, | |
| "epoch": 0.32426666666666665, | |
| "grad_norm": 2.9946090703577526, | |
| "kl": 0.69921875, | |
| "learning_rate": 8.687911130733266e-07, | |
| "loss": 0.0651, | |
| "reward": 1.377604216337204, | |
| "reward_std": 0.5387382358312607, | |
| "rewards/accuracy_reward": 0.6458333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.731770858168602, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 600.0625305175781, | |
| "epoch": 0.3264, | |
| "grad_norm": 4.34193749706627, | |
| "kl": 1.0322265625, | |
| "learning_rate": 8.664122306620184e-07, | |
| "loss": 0.1012, | |
| "reward": 1.229166716337204, | |
| "reward_std": 0.4584341421723366, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6875000149011612, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 699.4583587646484, | |
| "epoch": 0.32853333333333334, | |
| "grad_norm": 2.4506518251356577, | |
| "kl": 0.83935546875, | |
| "learning_rate": 8.640157290257396e-07, | |
| "loss": 0.1104, | |
| "reward": 1.0208333730697632, | |
| "reward_std": 0.46958719938993454, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6666666865348816, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 566.9271087646484, | |
| "epoch": 0.33066666666666666, | |
| "grad_norm": 5.401034584726453, | |
| "kl": 1.1689453125, | |
| "learning_rate": 8.61601741612244e-07, | |
| "loss": 0.2134, | |
| "reward": 1.2890625596046448, | |
| "reward_std": 0.44453180953860283, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6640625149011612, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 633.0104217529297, | |
| "epoch": 0.3328, | |
| "grad_norm": 2.927726674982878, | |
| "kl": 0.76513671875, | |
| "learning_rate": 8.591704028429703e-07, | |
| "loss": 0.0545, | |
| "reward": 1.1432291865348816, | |
| "reward_std": 0.4252583682537079, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7057291865348816, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 515.8021011352539, | |
| "epoch": 0.33493333333333336, | |
| "grad_norm": 7.314880509347044, | |
| "kl": 2.34375, | |
| "learning_rate": 8.567218481055575e-07, | |
| "loss": 0.2219, | |
| "reward": 1.2838541865348816, | |
| "reward_std": 0.5363652482628822, | |
| "rewards/accuracy_reward": 0.604166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6796875298023224, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 593.1250152587891, | |
| "epoch": 0.3370666666666667, | |
| "grad_norm": 6.2789742564910505, | |
| "kl": 1.208984375, | |
| "learning_rate": 8.542562137463047e-07, | |
| "loss": 0.0515, | |
| "reward": 1.1979166865348816, | |
| "reward_std": 0.36978180706501007, | |
| "rewards/accuracy_reward": 0.4895833395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7083333432674408, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 582.3854217529297, | |
| "epoch": 0.3392, | |
| "grad_norm": 5.297033801278587, | |
| "kl": 1.0830078125, | |
| "learning_rate": 8.517736370625802e-07, | |
| "loss": 0.1485, | |
| "reward": 1.3645833730697632, | |
| "reward_std": 0.4559536874294281, | |
| "rewards/accuracy_reward": 0.6979166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6666666865348816, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 647.9687652587891, | |
| "epoch": 0.3413333333333333, | |
| "grad_norm": 2.9904403312603587, | |
| "kl": 1.26806640625, | |
| "learning_rate": 8.492742562951751e-07, | |
| "loss": 0.1099, | |
| "reward": 1.096354216337204, | |
| "reward_std": 0.34164173156023026, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7005208432674408, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 631.9375305175781, | |
| "epoch": 0.34346666666666664, | |
| "grad_norm": 5.522771711959092, | |
| "kl": 1.095703125, | |
| "learning_rate": 8.467582106206057e-07, | |
| "loss": 0.1403, | |
| "reward": 1.1276042014360428, | |
| "reward_std": 0.4164763018488884, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.010416666977107525, | |
| "rewards/tag_count_reward": 0.6796875149011612, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 692.1562652587891, | |
| "epoch": 0.3456, | |
| "grad_norm": 6.080573070345314, | |
| "kl": 1.0849609375, | |
| "learning_rate": 8.44225640143364e-07, | |
| "loss": 0.0824, | |
| "reward": 1.0390625298023224, | |
| "reward_std": 0.4599665552377701, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6640625298023224, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 720.3333587646484, | |
| "epoch": 0.34773333333333334, | |
| "grad_norm": 2.824643415041674, | |
| "kl": 0.73681640625, | |
| "learning_rate": 8.416766858881155e-07, | |
| "loss": 0.043, | |
| "reward": 1.0598958730697632, | |
| "reward_std": 0.4137251526117325, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.684895858168602, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.7500305175781, | |
| "epoch": 0.34986666666666666, | |
| "grad_norm": 3.1784612712111007, | |
| "kl": 0.59130859375, | |
| "learning_rate": 8.391114897918462e-07, | |
| "loss": 0.0992, | |
| "reward": 1.3046875596046448, | |
| "reward_std": 0.3715285286307335, | |
| "rewards/accuracy_reward": 0.5937500223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7109375149011612, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 737.8229370117188, | |
| "epoch": 0.352, | |
| "grad_norm": 3.0199017235674552, | |
| "kl": 1.55859375, | |
| "learning_rate": 8.3653019469596e-07, | |
| "loss": 0.1017, | |
| "reward": 1.0104166716337204, | |
| "reward_std": 0.36515790969133377, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6041666865348816, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 761.9166870117188, | |
| "epoch": 0.35413333333333336, | |
| "grad_norm": 2.6556502283004013, | |
| "kl": 0.80078125, | |
| "learning_rate": 8.339329443383233e-07, | |
| "loss": 0.0712, | |
| "reward": 1.013020858168602, | |
| "reward_std": 0.407824095338583, | |
| "rewards/accuracy_reward": 0.3645833395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6484375298023224, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.3646087646484, | |
| "epoch": 0.3562666666666667, | |
| "grad_norm": 1.4394054723868772, | |
| "kl": 0.443359375, | |
| "learning_rate": 8.313198833452622e-07, | |
| "loss": 0.0856, | |
| "reward": 1.0781250298023224, | |
| "reward_std": 0.36224906146526337, | |
| "rewards/accuracy_reward": 0.35416668467223644, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7239583432674408, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 596.3854446411133, | |
| "epoch": 0.3584, | |
| "grad_norm": 3.231762913084113, | |
| "kl": 1.1796875, | |
| "learning_rate": 8.286911572235079e-07, | |
| "loss": 0.1796, | |
| "reward": 1.424479216337204, | |
| "reward_std": 0.4078570008277893, | |
| "rewards/accuracy_reward": 0.6770833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.747395858168602, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 590.2083435058594, | |
| "epoch": 0.3605333333333333, | |
| "grad_norm": 10.88252152738768, | |
| "kl": 2.607421875, | |
| "learning_rate": 8.260469123520953e-07, | |
| "loss": 0.2273, | |
| "reward": 1.3489584028720856, | |
| "reward_std": 0.46935708820819855, | |
| "rewards/accuracy_reward": 0.614583358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7343750298023224, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 612.3125152587891, | |
| "epoch": 0.3626666666666667, | |
| "grad_norm": 6.442990175492329, | |
| "kl": 2.0, | |
| "learning_rate": 8.233872959742116e-07, | |
| "loss": 0.2326, | |
| "reward": 1.1302083730697632, | |
| "reward_std": 0.4670635610818863, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6718750149011612, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 623.0937652587891, | |
| "epoch": 0.3648, | |
| "grad_norm": 1.8353876571256793, | |
| "kl": 0.764892578125, | |
| "learning_rate": 8.207124561889967e-07, | |
| "loss": 0.0401, | |
| "reward": 1.2161458730697632, | |
| "reward_std": 0.39249349758028984, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6119791865348816, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.8021087646484, | |
| "epoch": 0.36693333333333333, | |
| "grad_norm": 28.70359041511556, | |
| "kl": 4.802001953125, | |
| "learning_rate": 8.180225419432973e-07, | |
| "loss": 0.4525, | |
| "reward": 1.3203125, | |
| "reward_std": 0.4753311946988106, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7161458432674408, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 659.8125305175781, | |
| "epoch": 0.36906666666666665, | |
| "grad_norm": 2.7371530957315326, | |
| "kl": 1.0400390625, | |
| "learning_rate": 8.15317703023372e-07, | |
| "loss": -0.0115, | |
| "reward": 1.1015625298023224, | |
| "reward_std": 0.4031752720475197, | |
| "rewards/accuracy_reward": 0.3854166716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.716145858168602, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 594.1042022705078, | |
| "epoch": 0.3712, | |
| "grad_norm": 3.4265160026521544, | |
| "kl": 1.716796875, | |
| "learning_rate": 8.125980900465511e-07, | |
| "loss": 0.1577, | |
| "reward": 1.2083333730697632, | |
| "reward_std": 0.3657406345009804, | |
| "rewards/accuracy_reward": 0.5312500223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6770833432674408, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 611.8229370117188, | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 5.3357300756416555, | |
| "kl": 0.9595947265625, | |
| "learning_rate": 8.098638544528493e-07, | |
| "loss": 0.0992, | |
| "reward": 1.3906250298023224, | |
| "reward_std": 0.4218136966228485, | |
| "rewards/accuracy_reward": 0.6250000298023224, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7656250298023224, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 607.4791793823242, | |
| "epoch": 0.37546666666666667, | |
| "grad_norm": 3.058337503705944, | |
| "kl": 1.177734375, | |
| "learning_rate": 8.071151484965328e-07, | |
| "loss": 0.1292, | |
| "reward": 1.1067708730697632, | |
| "reward_std": 0.4495965465903282, | |
| "rewards/accuracy_reward": 0.3958333469927311, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7109375298023224, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 651.0104370117188, | |
| "epoch": 0.3776, | |
| "grad_norm": 2.7653663091043246, | |
| "kl": 0.4844970703125, | |
| "learning_rate": 8.043521252376417e-07, | |
| "loss": 0.0987, | |
| "reward": 1.315104216337204, | |
| "reward_std": 0.3565051704645157, | |
| "rewards/accuracy_reward": 0.5625000223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7526041865348816, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.5625152587891, | |
| "epoch": 0.3797333333333333, | |
| "grad_norm": 1.4254748050396235, | |
| "kl": 0.68603515625, | |
| "learning_rate": 8.015749385334661e-07, | |
| "loss": 0.0841, | |
| "reward": 1.2630208730697632, | |
| "reward_std": 0.3636063262820244, | |
| "rewards/accuracy_reward": 0.5312500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7317708432674408, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.1979370117188, | |
| "epoch": 0.3818666666666667, | |
| "grad_norm": 4.394185818927082, | |
| "kl": 0.69482421875, | |
| "learning_rate": 7.987837430299792e-07, | |
| "loss": 0.1899, | |
| "reward": 1.2942708730697632, | |
| "reward_std": 0.401991605758667, | |
| "rewards/accuracy_reward": 0.5520833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7421875149011612, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 569.0208587646484, | |
| "epoch": 0.384, | |
| "grad_norm": 1.6852257830359076, | |
| "kl": 0.3607177734375, | |
| "learning_rate": 7.959786941532256e-07, | |
| "loss": -0.0204, | |
| "reward": 1.2187500298023224, | |
| "reward_std": 0.33338820189237595, | |
| "rewards/accuracy_reward": 0.4895833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7291667014360428, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 675.3854522705078, | |
| "epoch": 0.38613333333333333, | |
| "grad_norm": 3.9561409906104403, | |
| "kl": 0.663818359375, | |
| "learning_rate": 7.931599481006668e-07, | |
| "loss": 0.0859, | |
| "reward": 1.143229216337204, | |
| "reward_std": 0.44593609124422073, | |
| "rewards/accuracy_reward": 0.4270833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.716145858168602, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 580.1562805175781, | |
| "epoch": 0.38826666666666665, | |
| "grad_norm": 2.4800102088439426, | |
| "kl": 0.46826171875, | |
| "learning_rate": 7.903276618324832e-07, | |
| "loss": 0.0902, | |
| "reward": 1.2265625149011612, | |
| "reward_std": 0.4622042179107666, | |
| "rewards/accuracy_reward": 0.5000000223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7265625149011612, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 571.9271011352539, | |
| "epoch": 0.3904, | |
| "grad_norm": 2.251311300102917, | |
| "kl": 0.4532470703125, | |
| "learning_rate": 7.874819930628346e-07, | |
| "loss": 0.0658, | |
| "reward": 1.221354216337204, | |
| "reward_std": 0.45174503326416016, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7213541865348816, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 623.9479370117188, | |
| "epoch": 0.39253333333333335, | |
| "grad_norm": 2.48906947440004, | |
| "kl": 0.71728515625, | |
| "learning_rate": 7.846231002510761e-07, | |
| "loss": 0.0067, | |
| "reward": 1.2473958730697632, | |
| "reward_std": 0.39163821935653687, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7265625149011612, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 665.7500152587891, | |
| "epoch": 0.39466666666666667, | |
| "grad_norm": 2.233069393435031, | |
| "kl": 0.56005859375, | |
| "learning_rate": 7.817511425929367e-07, | |
| "loss": 0.0517, | |
| "reward": 1.0703125149011612, | |
| "reward_std": 0.41522736102342606, | |
| "rewards/accuracy_reward": 0.3541666753590107, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7161458432674408, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 636.0312652587891, | |
| "epoch": 0.3968, | |
| "grad_norm": 3.3030455877601606, | |
| "kl": 0.84326171875, | |
| "learning_rate": 7.788662800116533e-07, | |
| "loss": 0.0796, | |
| "reward": 1.2890625149011612, | |
| "reward_std": 0.42196864262223244, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7473958432674408, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 678.4271087646484, | |
| "epoch": 0.3989333333333333, | |
| "grad_norm": 2.986964953491577, | |
| "kl": 0.4951171875, | |
| "learning_rate": 7.759686731490654e-07, | |
| "loss": 0.1355, | |
| "reward": 1.1796875596046448, | |
| "reward_std": 0.3517295569181442, | |
| "rewards/accuracy_reward": 0.447916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.731770858168602, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 697.5521087646484, | |
| "epoch": 0.4010666666666667, | |
| "grad_norm": 4.877935102719587, | |
| "kl": 1.8828125, | |
| "learning_rate": 7.730584833566703e-07, | |
| "loss": 0.2451, | |
| "reward": 0.9791666865348816, | |
| "reward_std": 0.4034550338983536, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.645833358168602, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 546.7500305175781, | |
| "epoch": 0.4032, | |
| "grad_norm": 3.1944699506008334, | |
| "kl": 1.01123046875, | |
| "learning_rate": 7.701358726866384e-07, | |
| "loss": 0.0536, | |
| "reward": 1.2395833730697632, | |
| "reward_std": 0.333698995411396, | |
| "rewards/accuracy_reward": 0.520833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7187500298023224, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 704.6875152587891, | |
| "epoch": 0.4053333333333333, | |
| "grad_norm": 3.923143130594266, | |
| "kl": 1.359375, | |
| "learning_rate": 7.672010038827887e-07, | |
| "loss": 0.1197, | |
| "reward": 1.036458358168602, | |
| "reward_std": 0.3906657174229622, | |
| "rewards/accuracy_reward": 0.3541666744276881, | |
| "rewards/format_reward": 0.010416666977107525, | |
| "rewards/tag_count_reward": 0.6718750149011612, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 586.5833587646484, | |
| "epoch": 0.40746666666666664, | |
| "grad_norm": 3.400512631439386, | |
| "kl": 1.1552734375, | |
| "learning_rate": 7.642540403715278e-07, | |
| "loss": 0.1642, | |
| "reward": 1.3359375596046448, | |
| "reward_std": 0.4361158236861229, | |
| "rewards/accuracy_reward": 0.6250000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7109375149011612, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 608.7812805175781, | |
| "epoch": 0.4096, | |
| "grad_norm": 3.6993696865093293, | |
| "kl": 1.0460205078125, | |
| "learning_rate": 7.61295146252748e-07, | |
| "loss": 0.132, | |
| "reward": 1.145833358168602, | |
| "reward_std": 0.388854943215847, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.645833358168602, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.6041793823242, | |
| "epoch": 0.41173333333333334, | |
| "grad_norm": 5.308243001605844, | |
| "kl": 1.2568359375, | |
| "learning_rate": 7.583244862906906e-07, | |
| "loss": 0.2267, | |
| "reward": 1.3255208730697632, | |
| "reward_std": 0.4222230240702629, | |
| "rewards/accuracy_reward": 0.6666666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6588542014360428, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 572.1354370117188, | |
| "epoch": 0.41386666666666666, | |
| "grad_norm": 6.174139506989273, | |
| "kl": 1.595703125, | |
| "learning_rate": 7.55342225904771e-07, | |
| "loss": 0.1736, | |
| "reward": 1.2473958730697632, | |
| "reward_std": 0.4287296533584595, | |
| "rewards/accuracy_reward": 0.5937500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6536458432674408, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 681.6875152587891, | |
| "epoch": 0.416, | |
| "grad_norm": 3.555603233744486, | |
| "kl": 1.49072265625, | |
| "learning_rate": 7.523485311603671e-07, | |
| "loss": 0.1133, | |
| "reward": 1.1354166865348816, | |
| "reward_std": 0.3183029256761074, | |
| "rewards/accuracy_reward": 0.447916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6875000298023224, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 762.3333587646484, | |
| "epoch": 0.41813333333333336, | |
| "grad_norm": 3.1970989941370918, | |
| "kl": 1.09765625, | |
| "learning_rate": 7.493435687595724e-07, | |
| "loss": 0.1391, | |
| "reward": 0.8515625298023224, | |
| "reward_std": 0.3355184718966484, | |
| "rewards/accuracy_reward": 0.2500000027939677, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6015625298023224, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 667.5625152587891, | |
| "epoch": 0.4202666666666667, | |
| "grad_norm": 5.220267991345135, | |
| "kl": 1.666015625, | |
| "learning_rate": 7.463275060319126e-07, | |
| "loss": 0.2431, | |
| "reward": 1.0338542014360428, | |
| "reward_std": 0.4331643208861351, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6380208432674408, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 644.3229370117188, | |
| "epoch": 0.4224, | |
| "grad_norm": 3.4305060066499995, | |
| "kl": 1.2626953125, | |
| "learning_rate": 7.43300510925029e-07, | |
| "loss": 0.102, | |
| "reward": 1.2343750447034836, | |
| "reward_std": 0.40559985488653183, | |
| "rewards/accuracy_reward": 0.583333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6510416865348816, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 695.9479370117188, | |
| "epoch": 0.4245333333333333, | |
| "grad_norm": 17.742982311990076, | |
| "kl": 2.861328125, | |
| "learning_rate": 7.40262751995325e-07, | |
| "loss": 0.209, | |
| "reward": 1.1484375596046448, | |
| "reward_std": 0.4041588194668293, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6276042014360428, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 836.7291717529297, | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 6.783101729560672, | |
| "kl": 1.001953125, | |
| "learning_rate": 7.372143983985823e-07, | |
| "loss": 0.059, | |
| "reward": 0.8515625149011612, | |
| "reward_std": 0.30101747065782547, | |
| "rewards/accuracy_reward": 0.2708333348855376, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5807291865348816, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 561.3541946411133, | |
| "epoch": 0.4288, | |
| "grad_norm": 108.92153903454, | |
| "kl": 1.4921875, | |
| "learning_rate": 7.341556198805391e-07, | |
| "loss": 0.224, | |
| "reward": 1.0937500447034836, | |
| "reward_std": 0.37287120521068573, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.614583358168602, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 777.1354370117188, | |
| "epoch": 0.43093333333333333, | |
| "grad_norm": 2.9867411539858932, | |
| "kl": 0.8515625, | |
| "learning_rate": 7.310865867674396e-07, | |
| "loss": 0.1169, | |
| "reward": 0.9010417014360428, | |
| "reward_std": 0.436382420361042, | |
| "rewards/accuracy_reward": 0.2812500111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6197916865348816, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 782.5937652587891, | |
| "epoch": 0.43306666666666666, | |
| "grad_norm": 4.115346747345138, | |
| "kl": 0.95947265625, | |
| "learning_rate": 7.28007469956549e-07, | |
| "loss": 0.1098, | |
| "reward": 0.8541667014360428, | |
| "reward_std": 0.3877977281808853, | |
| "rewards/accuracy_reward": 0.2812500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5729166865348816, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 703.0729217529297, | |
| "epoch": 0.4352, | |
| "grad_norm": 3.6906044672073235, | |
| "kl": 1.04296875, | |
| "learning_rate": 7.249184409066367e-07, | |
| "loss": 0.0836, | |
| "reward": 1.0989583730697632, | |
| "reward_std": 0.3794466406106949, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.598958358168602, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 662.0312652587891, | |
| "epoch": 0.43733333333333335, | |
| "grad_norm": 10.29645050459395, | |
| "kl": 1.0908203125, | |
| "learning_rate": 7.218196716284301e-07, | |
| "loss": 0.1656, | |
| "reward": 1.0833333730697632, | |
| "reward_std": 0.49071626365184784, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5833333432674408, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 612.1041946411133, | |
| "epoch": 0.43946666666666667, | |
| "grad_norm": 6.21786829985612, | |
| "kl": 0.70556640625, | |
| "learning_rate": 7.187113346750345e-07, | |
| "loss": 0.1538, | |
| "reward": 1.1692708730697632, | |
| "reward_std": 0.3329445421695709, | |
| "rewards/accuracy_reward": 0.5208333544433117, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6484375298023224, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 797.3020935058594, | |
| "epoch": 0.4416, | |
| "grad_norm": 7.254838875822759, | |
| "kl": 1.216796875, | |
| "learning_rate": 7.155936031323254e-07, | |
| "loss": 0.173, | |
| "reward": 0.9635416865348816, | |
| "reward_std": 0.47484801709651947, | |
| "rewards/accuracy_reward": 0.385416679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5781250149011612, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 700.5937805175781, | |
| "epoch": 0.4437333333333333, | |
| "grad_norm": 5.539580631333173, | |
| "kl": 0.888671875, | |
| "learning_rate": 7.124666506093111e-07, | |
| "loss": 0.1573, | |
| "reward": 1.0078125298023224, | |
| "reward_std": 0.3880564123392105, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5703125149011612, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 710.0729370117188, | |
| "epoch": 0.4458666666666667, | |
| "grad_norm": 2.6124973150855926, | |
| "kl": 1.08203125, | |
| "learning_rate": 7.093306512284641e-07, | |
| "loss": 0.1311, | |
| "reward": 0.9895833730697632, | |
| "reward_std": 0.42369063943624496, | |
| "rewards/accuracy_reward": 0.3854166828095913, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6041666865348816, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 704.3125152587891, | |
| "epoch": 0.448, | |
| "grad_norm": 3.0029466909891607, | |
| "kl": 1.5205078125, | |
| "learning_rate": 7.06185779616026e-07, | |
| "loss": 0.1592, | |
| "reward": 1.1250000298023224, | |
| "reward_std": 0.3417692631483078, | |
| "rewards/accuracy_reward": 0.5312500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5937500298023224, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 737.8333587646484, | |
| "epoch": 0.45013333333333333, | |
| "grad_norm": 3.5405911740571083, | |
| "kl": 1.529296875, | |
| "learning_rate": 7.030322108922831e-07, | |
| "loss": 0.1676, | |
| "reward": 0.9453125298023224, | |
| "reward_std": 0.41002119332551956, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6119791716337204, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 655.7291870117188, | |
| "epoch": 0.45226666666666665, | |
| "grad_norm": 3.3499868360490237, | |
| "kl": 1.4091796875, | |
| "learning_rate": 6.998701206618152e-07, | |
| "loss": 0.2315, | |
| "reward": 1.145833358168602, | |
| "reward_std": 0.5038532838225365, | |
| "rewards/accuracy_reward": 0.5520833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5937500298023224, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 699.6875152587891, | |
| "epoch": 0.4544, | |
| "grad_norm": 3.8318683544447714, | |
| "kl": 1.29736328125, | |
| "learning_rate": 6.966996850037167e-07, | |
| "loss": 0.1375, | |
| "reward": 1.0937500298023224, | |
| "reward_std": 0.4506056234240532, | |
| "rewards/accuracy_reward": 0.4479166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6458333432674408, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 597.9687805175781, | |
| "epoch": 0.45653333333333335, | |
| "grad_norm": 6.442210211914711, | |
| "kl": 1.5296630859375, | |
| "learning_rate": 6.935210804617932e-07, | |
| "loss": 0.1401, | |
| "reward": 1.0234375298023224, | |
| "reward_std": 0.3827114477753639, | |
| "rewards/accuracy_reward": 0.3750000176951289, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6484375298023224, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 606.8541870117188, | |
| "epoch": 0.45866666666666667, | |
| "grad_norm": 2.2941469936410193, | |
| "kl": 0.8770751953125, | |
| "learning_rate": 6.903344840347285e-07, | |
| "loss": 0.1111, | |
| "reward": 1.1640625447034836, | |
| "reward_std": 0.37582574412226677, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6432291865348816, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 727.4479217529297, | |
| "epoch": 0.4608, | |
| "grad_norm": 4.9236501743742584, | |
| "kl": 0.9921875, | |
| "learning_rate": 6.871400731662303e-07, | |
| "loss": 0.1039, | |
| "reward": 1.0494792014360428, | |
| "reward_std": 0.40211090445518494, | |
| "rewards/accuracy_reward": 0.3958333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.653645858168602, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 742.8750152587891, | |
| "epoch": 0.4629333333333333, | |
| "grad_norm": 4.066568727046315, | |
| "kl": 1.62353515625, | |
| "learning_rate": 6.839380257351485e-07, | |
| "loss": 0.1486, | |
| "reward": 0.9739583730697632, | |
| "reward_std": 0.3369832746684551, | |
| "rewards/accuracy_reward": 0.36458334140479565, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6093750149011612, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 568.0104370117188, | |
| "epoch": 0.4650666666666667, | |
| "grad_norm": 2.4094360656304215, | |
| "kl": 1.1103515625, | |
| "learning_rate": 6.807285200455708e-07, | |
| "loss": 0.137, | |
| "reward": 1.2864584028720856, | |
| "reward_std": 0.46483898907899857, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822917014360428, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 673.2291717529297, | |
| "epoch": 0.4672, | |
| "grad_norm": 2.369097215742238, | |
| "kl": 0.833984375, | |
| "learning_rate": 6.775117348168934e-07, | |
| "loss": 0.1527, | |
| "reward": 1.0156250447034836, | |
| "reward_std": 0.30307888612151146, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5989583432674408, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 641.7708587646484, | |
| "epoch": 0.4693333333333333, | |
| "grad_norm": 2.9340505919038904, | |
| "kl": 0.74609375, | |
| "learning_rate": 6.742878491738691e-07, | |
| "loss": 0.091, | |
| "reward": 1.260416716337204, | |
| "reward_std": 0.36950888112187386, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7395833432674408, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 620.0625305175781, | |
| "epoch": 0.47146666666666665, | |
| "grad_norm": 2.6901909422106742, | |
| "kl": 1.22802734375, | |
| "learning_rate": 6.710570426366329e-07, | |
| "loss": 0.0777, | |
| "reward": 1.1744792014360428, | |
| "reward_std": 0.4119907468557358, | |
| "rewards/accuracy_reward": 0.5208333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.653645858168602, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 647.4375, | |
| "epoch": 0.4736, | |
| "grad_norm": 3.075286484057231, | |
| "kl": 1.095703125, | |
| "learning_rate": 6.67819495110706e-07, | |
| "loss": 0.205, | |
| "reward": 1.2552083432674408, | |
| "reward_std": 0.3984260931611061, | |
| "rewards/accuracy_reward": 0.5833333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6718750298023224, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 750.0000152587891, | |
| "epoch": 0.47573333333333334, | |
| "grad_norm": 1.9280597712715668, | |
| "kl": 1.0419921875, | |
| "learning_rate": 6.645753868769772e-07, | |
| "loss": 0.1367, | |
| "reward": 1.0416666865348816, | |
| "reward_std": 0.4005061313509941, | |
| "rewards/accuracy_reward": 0.3854166716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6562500149011612, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 666.4583587646484, | |
| "epoch": 0.47786666666666666, | |
| "grad_norm": 1.981514239559327, | |
| "kl": 1.0439453125, | |
| "learning_rate": 6.613248985816649e-07, | |
| "loss": 0.151, | |
| "reward": 1.1770833730697632, | |
| "reward_std": 0.4760870635509491, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6562500298023224, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 719.6458587646484, | |
| "epoch": 0.48, | |
| "grad_norm": 3.0499611732687777, | |
| "kl": 1.9921875, | |
| "learning_rate": 6.580682112262565e-07, | |
| "loss": 0.1178, | |
| "reward": 1.0390625298023224, | |
| "reward_std": 0.3636975698173046, | |
| "rewards/accuracy_reward": 0.4270833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6119791716337204, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 657.6354370117188, | |
| "epoch": 0.48213333333333336, | |
| "grad_norm": 2.0077314384783196, | |
| "kl": 0.50634765625, | |
| "learning_rate": 6.548055061574312e-07, | |
| "loss": 0.0396, | |
| "reward": 1.0494792014360428, | |
| "reward_std": 0.354195736348629, | |
| "rewards/accuracy_reward": 0.4166666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6328125149011612, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 584.9166870117188, | |
| "epoch": 0.4842666666666667, | |
| "grad_norm": 1.6154150634055964, | |
| "kl": 0.898193359375, | |
| "learning_rate": 6.515369650569602e-07, | |
| "loss": 0.0755, | |
| "reward": 1.2343750298023224, | |
| "reward_std": 0.3991445451974869, | |
| "rewards/accuracy_reward": 0.5312500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7031250298023224, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 730.5625152587891, | |
| "epoch": 0.4864, | |
| "grad_norm": 2.3485701929372054, | |
| "kl": 0.856689453125, | |
| "learning_rate": 6.482627699315914e-07, | |
| "loss": 0.081, | |
| "reward": 1.1406250596046448, | |
| "reward_std": 0.4424668923020363, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822916865348816, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 659.1979370117188, | |
| "epoch": 0.4885333333333333, | |
| "grad_norm": 2.8605627209621978, | |
| "kl": 0.80029296875, | |
| "learning_rate": 6.449831031029133e-07, | |
| "loss": 0.1213, | |
| "reward": 1.114583358168602, | |
| "reward_std": 0.3903278261423111, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.677083358168602, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.1042022705078, | |
| "epoch": 0.49066666666666664, | |
| "grad_norm": 3.0192923488140875, | |
| "kl": 1.078125, | |
| "learning_rate": 6.416981471972025e-07, | |
| "loss": 0.0592, | |
| "reward": 1.315104216337204, | |
| "reward_std": 0.38509828597307205, | |
| "rewards/accuracy_reward": 0.6354166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6796875149011612, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 722.8854217529297, | |
| "epoch": 0.4928, | |
| "grad_norm": 2.281989684222596, | |
| "kl": 0.880859375, | |
| "learning_rate": 6.384080851352553e-07, | |
| "loss": 0.1056, | |
| "reward": 1.0182291865348816, | |
| "reward_std": 0.3720996528863907, | |
| "rewards/accuracy_reward": 0.385416679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6328125298023224, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.4062652587891, | |
| "epoch": 0.49493333333333334, | |
| "grad_norm": 2.224831725483049, | |
| "kl": 1.01953125, | |
| "learning_rate": 6.351131001222011e-07, | |
| "loss": 0.1128, | |
| "reward": 1.1406250447034836, | |
| "reward_std": 0.41287852823734283, | |
| "rewards/accuracy_reward": 0.4895833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6510417014360428, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 693.6458435058594, | |
| "epoch": 0.49706666666666666, | |
| "grad_norm": 2.43571312728581, | |
| "kl": 0.83447265625, | |
| "learning_rate": 6.318133756373009e-07, | |
| "loss": 0.0833, | |
| "reward": 1.1145833730697632, | |
| "reward_std": 0.419599324464798, | |
| "rewards/accuracy_reward": 0.4479166716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6666667014360428, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 707.8437805175781, | |
| "epoch": 0.4992, | |
| "grad_norm": 4.855894790549249, | |
| "kl": 1.2919921875, | |
| "learning_rate": 6.285090954237299e-07, | |
| "loss": 0.0917, | |
| "reward": 0.9609375298023224, | |
| "reward_std": 0.3755484074354172, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6484375298023224, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 679.3021087646484, | |
| "epoch": 0.5013333333333333, | |
| "grad_norm": 1.9176504636867993, | |
| "kl": 0.8720703125, | |
| "learning_rate": 6.252004434783468e-07, | |
| "loss": 0.1368, | |
| "reward": 1.0755208730697632, | |
| "reward_std": 0.38472916185855865, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.669270858168602, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 802.9062805175781, | |
| "epoch": 0.5034666666666666, | |
| "grad_norm": 6.8553458902973015, | |
| "kl": 1.1357421875, | |
| "learning_rate": 6.218876040414476e-07, | |
| "loss": 0.0643, | |
| "reward": 0.8437500447034836, | |
| "reward_std": 0.37247762084007263, | |
| "rewards/accuracy_reward": 0.23958334140479565, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6041666865348816, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 743.0000152587891, | |
| "epoch": 0.5056, | |
| "grad_norm": 2.8266738347480693, | |
| "kl": 1.078125, | |
| "learning_rate": 6.185707615865056e-07, | |
| "loss": 0.0694, | |
| "reward": 0.9713541865348816, | |
| "reward_std": 0.4353698194026947, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.638020858168602, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 689.9166870117188, | |
| "epoch": 0.5077333333333334, | |
| "grad_norm": 1.349416796393404, | |
| "kl": 0.74609375, | |
| "learning_rate": 6.152501008099008e-07, | |
| "loss": 0.0332, | |
| "reward": 1.0625000596046448, | |
| "reward_std": 0.40154092013835907, | |
| "rewards/accuracy_reward": 0.4062500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6562500149011612, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 767.4062805175781, | |
| "epoch": 0.5098666666666667, | |
| "grad_norm": 2.3220730666040454, | |
| "kl": 0.6181640625, | |
| "learning_rate": 6.119258066206333e-07, | |
| "loss": 0.0478, | |
| "reward": 1.2656250596046448, | |
| "reward_std": 0.4823746606707573, | |
| "rewards/accuracy_reward": 0.5312500223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7343750149011612, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 686.7604370117188, | |
| "epoch": 0.512, | |
| "grad_norm": 2.917475156328743, | |
| "kl": 0.8125, | |
| "learning_rate": 6.085980641300277e-07, | |
| "loss": 0.0955, | |
| "reward": 1.111979216337204, | |
| "reward_std": 0.4742467850446701, | |
| "rewards/accuracy_reward": 0.4062500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7057291865348816, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 624.9166870117188, | |
| "epoch": 0.5141333333333333, | |
| "grad_norm": 3.3550027166193908, | |
| "kl": 0.671875, | |
| "learning_rate": 6.052670586414254e-07, | |
| "loss": 0.0837, | |
| "reward": 1.1380208730697632, | |
| "reward_std": 0.3583967909216881, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7213541865348816, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 678.4062652587891, | |
| "epoch": 0.5162666666666667, | |
| "grad_norm": 3.7580893271794977, | |
| "kl": 0.65283203125, | |
| "learning_rate": 6.01932975639866e-07, | |
| "loss": 0.1068, | |
| "reward": 1.0442708730697632, | |
| "reward_std": 0.3762342110276222, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6692708432674408, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 664.3854370117188, | |
| "epoch": 0.5184, | |
| "grad_norm": 2.727664247789793, | |
| "kl": 0.82568359375, | |
| "learning_rate": 5.985960007817583e-07, | |
| "loss": 0.1129, | |
| "reward": 1.1848958730697632, | |
| "reward_std": 0.46748675405979156, | |
| "rewards/accuracy_reward": 0.510416679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6744791716337204, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 731.2708740234375, | |
| "epoch": 0.5205333333333333, | |
| "grad_norm": 5.516653907053587, | |
| "kl": 1.6396484375, | |
| "learning_rate": 5.952563198845426e-07, | |
| "loss": 0.1728, | |
| "reward": 0.9843750149011612, | |
| "reward_std": 0.40375181287527084, | |
| "rewards/accuracy_reward": 0.3645833395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6197917014360428, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 660.2083587646484, | |
| "epoch": 0.5226666666666666, | |
| "grad_norm": 3.4500320027641154, | |
| "kl": 1.484375, | |
| "learning_rate": 5.91914118916343e-07, | |
| "loss": 0.128, | |
| "reward": 1.1562500298023224, | |
| "reward_std": 0.44681739807128906, | |
| "rewards/accuracy_reward": 0.4583333544433117, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6979167014360428, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 693.4479370117188, | |
| "epoch": 0.5248, | |
| "grad_norm": 2.517559682605646, | |
| "kl": 0.7060546875, | |
| "learning_rate": 5.885695839856129e-07, | |
| "loss": 0.0819, | |
| "reward": 1.205729216337204, | |
| "reward_std": 0.4331817254424095, | |
| "rewards/accuracy_reward": 0.520833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.684895858168602, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 611.9166870117188, | |
| "epoch": 0.5269333333333334, | |
| "grad_norm": 2.287351931421754, | |
| "kl": 0.98193359375, | |
| "learning_rate": 5.852229013307704e-07, | |
| "loss": 0.0271, | |
| "reward": 1.0859375298023224, | |
| "reward_std": 0.32051411271095276, | |
| "rewards/accuracy_reward": 0.38541666977107525, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7005208432674408, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 641.1041793823242, | |
| "epoch": 0.5290666666666667, | |
| "grad_norm": 4.017373603712157, | |
| "kl": 1.3046875, | |
| "learning_rate": 5.818742573098282e-07, | |
| "loss": 0.0928, | |
| "reward": 1.1484375447034836, | |
| "reward_std": 0.380577452480793, | |
| "rewards/accuracy_reward": 0.4479166716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.700520858168602, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 703.0104522705078, | |
| "epoch": 0.5312, | |
| "grad_norm": 8.899673545122152, | |
| "kl": 1.818359375, | |
| "learning_rate": 5.785238383900171e-07, | |
| "loss": 0.1057, | |
| "reward": 0.9609375298023224, | |
| "reward_std": 0.3786415830254555, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6276041716337204, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 668.1562805175781, | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 3.776921543714447, | |
| "kl": 1.2470703125, | |
| "learning_rate": 5.751718311374019e-07, | |
| "loss": 0.1572, | |
| "reward": 1.0911458432674408, | |
| "reward_std": 0.4785156399011612, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6953125149011612, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 692.8854370117188, | |
| "epoch": 0.5354666666666666, | |
| "grad_norm": 5.561615692358444, | |
| "kl": 0.7373046875, | |
| "learning_rate": 5.718184222064923e-07, | |
| "loss": 0.0939, | |
| "reward": 1.1145833730697632, | |
| "reward_std": 0.3925316818058491, | |
| "rewards/accuracy_reward": 0.3541666753590107, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7604166865348816, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 726.2916717529297, | |
| "epoch": 0.5376, | |
| "grad_norm": 2.396459741540722, | |
| "kl": 1.03515625, | |
| "learning_rate": 5.684637983298504e-07, | |
| "loss": 0.0482, | |
| "reward": 1.1484375149011612, | |
| "reward_std": 0.47722647339105606, | |
| "rewards/accuracy_reward": 0.4375000223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7109375149011612, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 662.1146011352539, | |
| "epoch": 0.5397333333333333, | |
| "grad_norm": 1.3794330641432073, | |
| "kl": 0.79931640625, | |
| "learning_rate": 5.65108146307691e-07, | |
| "loss": 0.1053, | |
| "reward": 1.3203125298023224, | |
| "reward_std": 0.37360265105962753, | |
| "rewards/accuracy_reward": 0.5312500111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7890625149011612, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 665.2812652587891, | |
| "epoch": 0.5418666666666667, | |
| "grad_norm": 4.3373374964971285, | |
| "kl": 0.6513671875, | |
| "learning_rate": 5.617516529974812e-07, | |
| "loss": 0.0475, | |
| "reward": 1.1640625298023224, | |
| "reward_std": 0.42270269989967346, | |
| "rewards/accuracy_reward": 0.447916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.716145858168602, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 659.1562805175781, | |
| "epoch": 0.544, | |
| "grad_norm": 2.857500389379376, | |
| "kl": 0.66552734375, | |
| "learning_rate": 5.583945053035345e-07, | |
| "loss": 0.0583, | |
| "reward": 1.260416716337204, | |
| "reward_std": 0.36464181914925575, | |
| "rewards/accuracy_reward": 0.5104166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7500000298023224, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 623.3854522705078, | |
| "epoch": 0.5461333333333334, | |
| "grad_norm": 2.195937822750307, | |
| "kl": 0.9833984375, | |
| "learning_rate": 5.550368901666031e-07, | |
| "loss": 0.0581, | |
| "reward": 1.2864583730697632, | |
| "reward_std": 0.5202609151601791, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7447916865348816, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 758.0833435058594, | |
| "epoch": 0.5482666666666667, | |
| "grad_norm": 3.654130355196583, | |
| "kl": 0.900634765625, | |
| "learning_rate": 5.516789945534687e-07, | |
| "loss": 0.0686, | |
| "reward": 1.0442708730697632, | |
| "reward_std": 0.3605649992823601, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7109375149011612, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 673.7187576293945, | |
| "epoch": 0.5504, | |
| "grad_norm": 3.154289332558006, | |
| "kl": 0.73095703125, | |
| "learning_rate": 5.483210054465313e-07, | |
| "loss": 0.0627, | |
| "reward": 1.166666716337204, | |
| "reward_std": 0.41931793093681335, | |
| "rewards/accuracy_reward": 0.38541666977107525, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7812500149011612, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 653.3125152587891, | |
| "epoch": 0.5525333333333333, | |
| "grad_norm": 5.759985243075577, | |
| "kl": 2.16796875, | |
| "learning_rate": 5.44963109833397e-07, | |
| "loss": 0.1535, | |
| "reward": 1.1901042014360428, | |
| "reward_std": 0.42829064279794693, | |
| "rewards/accuracy_reward": 0.4479166716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7421875149011612, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 740.5312652587891, | |
| "epoch": 0.5546666666666666, | |
| "grad_norm": 1.9810573316642937, | |
| "kl": 1.783203125, | |
| "learning_rate": 5.416054946964657e-07, | |
| "loss": 0.105, | |
| "reward": 1.1250000298023224, | |
| "reward_std": 0.4422023892402649, | |
| "rewards/accuracy_reward": 0.4062500223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7187500149011612, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 731.4687652587891, | |
| "epoch": 0.5568, | |
| "grad_norm": 1.7507988717457068, | |
| "kl": 0.9384765625, | |
| "learning_rate": 5.382483470025188e-07, | |
| "loss": 0.0749, | |
| "reward": 1.0312500447034836, | |
| "reward_std": 0.3682085946202278, | |
| "rewards/accuracy_reward": 0.28125000558793545, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7500000298023224, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 667.3437805175781, | |
| "epoch": 0.5589333333333333, | |
| "grad_norm": 1.4363727179496517, | |
| "kl": 0.61376953125, | |
| "learning_rate": 5.34891853692309e-07, | |
| "loss": 0.1173, | |
| "reward": 1.3385416865348816, | |
| "reward_std": 0.32814982905983925, | |
| "rewards/accuracy_reward": 0.5729166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7656250298023224, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 688.8958587646484, | |
| "epoch": 0.5610666666666667, | |
| "grad_norm": 2.577013694965925, | |
| "kl": 1.4384765625, | |
| "learning_rate": 5.315362016701495e-07, | |
| "loss": 0.0793, | |
| "reward": 1.2578125596046448, | |
| "reward_std": 0.47568681836128235, | |
| "rewards/accuracy_reward": 0.5312500223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7265625149011612, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 574.2291717529297, | |
| "epoch": 0.5632, | |
| "grad_norm": 2.308481592871747, | |
| "kl": 0.88916015625, | |
| "learning_rate": 5.281815777935076e-07, | |
| "loss": -0.0186, | |
| "reward": 1.4843750596046448, | |
| "reward_std": 0.42332855612039566, | |
| "rewards/accuracy_reward": 0.6979167014360428, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.786458358168602, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 738.8229370117188, | |
| "epoch": 0.5653333333333334, | |
| "grad_norm": 3.6223448115920025, | |
| "kl": 1.568115234375, | |
| "learning_rate": 5.248281688625984e-07, | |
| "loss": 0.1697, | |
| "reward": 1.075520858168602, | |
| "reward_std": 0.3689369484782219, | |
| "rewards/accuracy_reward": 0.3437500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.731770858168602, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 725.0104370117188, | |
| "epoch": 0.5674666666666667, | |
| "grad_norm": 2.8406366622019723, | |
| "kl": 0.662109375, | |
| "learning_rate": 5.21476161609983e-07, | |
| "loss": 0.0583, | |
| "reward": 1.2812500596046448, | |
| "reward_std": 0.3292672336101532, | |
| "rewards/accuracy_reward": 0.5104166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.770833358168602, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 604.2500152587891, | |
| "epoch": 0.5696, | |
| "grad_norm": 3.8506726775482414, | |
| "kl": 1.0810546875, | |
| "learning_rate": 5.181257426901719e-07, | |
| "loss": 0.0963, | |
| "reward": 1.385416716337204, | |
| "reward_std": 0.48288238793611526, | |
| "rewards/accuracy_reward": 0.6354166716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7500000149011612, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 622.3541870117188, | |
| "epoch": 0.5717333333333333, | |
| "grad_norm": 5.330990026360423, | |
| "kl": 1.537109375, | |
| "learning_rate": 5.147770986692298e-07, | |
| "loss": 0.1471, | |
| "reward": 1.1484375149011612, | |
| "reward_std": 0.3771478980779648, | |
| "rewards/accuracy_reward": 0.4166666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.731770858168602, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 614.8333435058594, | |
| "epoch": 0.5738666666666666, | |
| "grad_norm": 5.401106666341018, | |
| "kl": 1.873046875, | |
| "learning_rate": 5.114304160143872e-07, | |
| "loss": 0.0376, | |
| "reward": 1.1484375149011612, | |
| "reward_std": 0.40641965717077255, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7421875298023224, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 681.125, | |
| "epoch": 0.576, | |
| "grad_norm": 2.132821179572739, | |
| "kl": 1.201171875, | |
| "learning_rate": 5.080858810836569e-07, | |
| "loss": 0.1105, | |
| "reward": 1.1458333730697632, | |
| "reward_std": 0.35508203506469727, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.708333358168602, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 790.9062805175781, | |
| "epoch": 0.5781333333333334, | |
| "grad_norm": 7.649321265767852, | |
| "kl": 2.1328125, | |
| "learning_rate": 5.047436801154574e-07, | |
| "loss": 0.0877, | |
| "reward": 0.8098958432674408, | |
| "reward_std": 0.3416658490896225, | |
| "rewards/accuracy_reward": 0.1770833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6328125149011612, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 744.7916870117188, | |
| "epoch": 0.5802666666666667, | |
| "grad_norm": 85.28665013162654, | |
| "kl": 5.845703125, | |
| "learning_rate": 5.014039992182416e-07, | |
| "loss": 0.3261, | |
| "reward": 1.221354216337204, | |
| "reward_std": 0.4842746704816818, | |
| "rewards/accuracy_reward": 0.4895833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.731770858168602, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 732.8020935058594, | |
| "epoch": 0.5824, | |
| "grad_norm": 3.0599918049429764, | |
| "kl": 1.1376953125, | |
| "learning_rate": 4.98067024360134e-07, | |
| "loss": 0.0921, | |
| "reward": 1.1354167014360428, | |
| "reward_std": 0.4286448433995247, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6979166865348816, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 774.8541870117188, | |
| "epoch": 0.5845333333333333, | |
| "grad_norm": 3.424931636155576, | |
| "kl": 1.0224609375, | |
| "learning_rate": 4.947329413585745e-07, | |
| "loss": 0.0897, | |
| "reward": 1.1328125298023224, | |
| "reward_std": 0.4221769720315933, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6953125149011612, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 776.0833587646484, | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 2.225091632923016, | |
| "kl": 1.328125, | |
| "learning_rate": 4.914019358699724e-07, | |
| "loss": 0.0384, | |
| "reward": 1.098958358168602, | |
| "reward_std": 0.3936329632997513, | |
| "rewards/accuracy_reward": 0.3437500037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7552083432674408, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 789.9687805175781, | |
| "epoch": 0.5888, | |
| "grad_norm": 2.1457006633806324, | |
| "kl": 1.3408203125, | |
| "learning_rate": 4.880741933793668e-07, | |
| "loss": 0.1124, | |
| "reward": 0.8880208730697632, | |
| "reward_std": 0.44243620336055756, | |
| "rewards/accuracy_reward": 0.2604166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6276042014360428, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 611.0104370117188, | |
| "epoch": 0.5909333333333333, | |
| "grad_norm": 1.9977027582605016, | |
| "kl": 0.7919921875, | |
| "learning_rate": 4.847498991900991e-07, | |
| "loss": 0.145, | |
| "reward": 1.2786458730697632, | |
| "reward_std": 0.5421346426010132, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7369791716337204, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 649.6354522705078, | |
| "epoch": 0.5930666666666666, | |
| "grad_norm": 2.205181301769632, | |
| "kl": 0.869140625, | |
| "learning_rate": 4.814292384134943e-07, | |
| "loss": 0.0844, | |
| "reward": 1.190104216337204, | |
| "reward_std": 0.5330179780721664, | |
| "rewards/accuracy_reward": 0.4895833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.700520858168602, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 725.1875152587891, | |
| "epoch": 0.5952, | |
| "grad_norm": 3.808352664658485, | |
| "kl": 1.3873291015625, | |
| "learning_rate": 4.781123959585526e-07, | |
| "loss": 0.0791, | |
| "reward": 1.0364583730697632, | |
| "reward_std": 0.43167005479335785, | |
| "rewards/accuracy_reward": 0.3645833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6718750149011612, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 746.4479370117188, | |
| "epoch": 0.5973333333333334, | |
| "grad_norm": 4.134001817100567, | |
| "kl": 1.68798828125, | |
| "learning_rate": 4.7479955652165315e-07, | |
| "loss": 0.1172, | |
| "reward": 1.1901041865348816, | |
| "reward_std": 0.3821878246963024, | |
| "rewards/accuracy_reward": 0.4687500260770321, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7213542014360428, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 674.4375152587891, | |
| "epoch": 0.5994666666666667, | |
| "grad_norm": 2.24511147535919, | |
| "kl": 0.7109375, | |
| "learning_rate": 4.714909045762702e-07, | |
| "loss": 0.0876, | |
| "reward": 1.104166716337204, | |
| "reward_std": 0.4187440648674965, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.708333358168602, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 691.1562652587891, | |
| "epoch": 0.6016, | |
| "grad_norm": 2.122169433416534, | |
| "kl": 0.940185546875, | |
| "learning_rate": 4.681866243626992e-07, | |
| "loss": 0.1236, | |
| "reward": 1.2083334028720856, | |
| "reward_std": 0.46300840377807617, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.708333358168602, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 690.8333435058594, | |
| "epoch": 0.6037333333333333, | |
| "grad_norm": 4.58226981588225, | |
| "kl": 1.330078125, | |
| "learning_rate": 4.6488689987779893e-07, | |
| "loss": 0.0949, | |
| "reward": 1.299479216337204, | |
| "reward_std": 0.5550074055790901, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7578125298023224, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 674.6041870117188, | |
| "epoch": 0.6058666666666667, | |
| "grad_norm": 2.0045221335656356, | |
| "kl": 0.68896484375, | |
| "learning_rate": 4.615919148647448e-07, | |
| "loss": 0.0382, | |
| "reward": 1.2916666865348816, | |
| "reward_std": 0.3872656598687172, | |
| "rewards/accuracy_reward": 0.5312500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7604166865348816, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 596.6562652587891, | |
| "epoch": 0.608, | |
| "grad_norm": 2.388321430701261, | |
| "kl": 0.82421875, | |
| "learning_rate": 4.583018528027975e-07, | |
| "loss": 0.0618, | |
| "reward": 1.2109375298023224, | |
| "reward_std": 0.386493518948555, | |
| "rewards/accuracy_reward": 0.4479166939854622, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.763020858168602, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 654.4687805175781, | |
| "epoch": 0.6101333333333333, | |
| "grad_norm": 1.5558415219485469, | |
| "kl": 0.814453125, | |
| "learning_rate": 4.550168968970869e-07, | |
| "loss": 0.0904, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.35980185866355896, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6875000149011612, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 700.6354370117188, | |
| "epoch": 0.6122666666666666, | |
| "grad_norm": 2.291619988144414, | |
| "kl": 0.670166015625, | |
| "learning_rate": 4.5173723006840856e-07, | |
| "loss": 0.0508, | |
| "reward": 1.0911458730697632, | |
| "reward_std": 0.34144046902656555, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6953125149011612, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 736.4062805175781, | |
| "epoch": 0.6144, | |
| "grad_norm": 1.9834242115432024, | |
| "kl": 0.6470947265625, | |
| "learning_rate": 4.484630349430397e-07, | |
| "loss": 0.0779, | |
| "reward": 1.1953125298023224, | |
| "reward_std": 0.46002406626939774, | |
| "rewards/accuracy_reward": 0.447916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7473958730697632, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 789.8021087646484, | |
| "epoch": 0.6165333333333334, | |
| "grad_norm": 2.3855167211416366, | |
| "kl": 0.6494140625, | |
| "learning_rate": 4.451944938425689e-07, | |
| "loss": 0.0549, | |
| "reward": 1.0781250447034836, | |
| "reward_std": 0.37145940214395523, | |
| "rewards/accuracy_reward": 0.3541666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7239583432674408, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 674.1041870117188, | |
| "epoch": 0.6186666666666667, | |
| "grad_norm": 1.918889621315985, | |
| "kl": 0.78369140625, | |
| "learning_rate": 4.419317887737434e-07, | |
| "loss": 0.0668, | |
| "reward": 1.3151041865348816, | |
| "reward_std": 0.45681414008140564, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7734375149011612, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.1562805175781, | |
| "epoch": 0.6208, | |
| "grad_norm": 4.355501770676797, | |
| "kl": 0.7783203125, | |
| "learning_rate": 4.386751014183351e-07, | |
| "loss": 0.1746, | |
| "reward": 1.3515625298023224, | |
| "reward_std": 0.42713654786348343, | |
| "rewards/accuracy_reward": 0.5625000223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7890625149011612, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 596.4270935058594, | |
| "epoch": 0.6229333333333333, | |
| "grad_norm": 2.342168905969303, | |
| "kl": 1.732421875, | |
| "learning_rate": 4.354246131230226e-07, | |
| "loss": 0.2066, | |
| "reward": 1.2239583730697632, | |
| "reward_std": 0.5141923800110817, | |
| "rewards/accuracy_reward": 0.4687500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7552083432674408, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 733.4270935058594, | |
| "epoch": 0.6250666666666667, | |
| "grad_norm": 3.4490712051569914, | |
| "kl": 1.337890625, | |
| "learning_rate": 4.3218050488929415e-07, | |
| "loss": 0.0777, | |
| "reward": 1.0494791865348816, | |
| "reward_std": 0.36260855570435524, | |
| "rewards/accuracy_reward": 0.35416667722165585, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6953125, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 674.5625152587891, | |
| "epoch": 0.6272, | |
| "grad_norm": 8.813117801633163, | |
| "kl": 2.291015625, | |
| "learning_rate": 4.289429573633672e-07, | |
| "loss": 0.1296, | |
| "reward": 1.1614583730697632, | |
| "reward_std": 0.48469968885183334, | |
| "rewards/accuracy_reward": 0.4270833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7343750149011612, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 765.7812805175781, | |
| "epoch": 0.6293333333333333, | |
| "grad_norm": 2.829484028197823, | |
| "kl": 1.390625, | |
| "learning_rate": 4.257121508261311e-07, | |
| "loss": 0.0696, | |
| "reward": 0.911458358168602, | |
| "reward_std": 0.34382112324237823, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822917014360428, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 688.0000152587891, | |
| "epoch": 0.6314666666666666, | |
| "grad_norm": 3.8216063069299366, | |
| "kl": 1.1865234375, | |
| "learning_rate": 4.2248826518310663e-07, | |
| "loss": 0.0771, | |
| "reward": 1.1796875447034836, | |
| "reward_std": 0.5371479094028473, | |
| "rewards/accuracy_reward": 0.447916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.731770858168602, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 676.5937652587891, | |
| "epoch": 0.6336, | |
| "grad_norm": 7.06830505996, | |
| "kl": 2.5546875, | |
| "learning_rate": 4.1927147995442925e-07, | |
| "loss": 0.1405, | |
| "reward": 1.221354216337204, | |
| "reward_std": 0.4441913291811943, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7213541865348816, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 648.8020935058594, | |
| "epoch": 0.6357333333333334, | |
| "grad_norm": 3.04039676189216, | |
| "kl": 1.435546875, | |
| "learning_rate": 4.160619742648517e-07, | |
| "loss": 0.137, | |
| "reward": 1.2421875596046448, | |
| "reward_std": 0.4874294400215149, | |
| "rewards/accuracy_reward": 0.4687500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7734375149011612, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 711.3541793823242, | |
| "epoch": 0.6378666666666667, | |
| "grad_norm": 2.8444606747983294, | |
| "kl": 1.380859375, | |
| "learning_rate": 4.128599268337699e-07, | |
| "loss": 0.1405, | |
| "reward": 1.1197917014360428, | |
| "reward_std": 0.5157921761274338, | |
| "rewards/accuracy_reward": 0.4270833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6927083432674408, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 739.6354370117188, | |
| "epoch": 0.64, | |
| "grad_norm": 7.001915278918098, | |
| "kl": 1.484375, | |
| "learning_rate": 4.096655159652717e-07, | |
| "loss": 0.1311, | |
| "reward": 1.1901041865348816, | |
| "reward_std": 0.4807375743985176, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7317708432674408, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 676.8333435058594, | |
| "epoch": 0.6421333333333333, | |
| "grad_norm": 2.1230958459406404, | |
| "kl": 1.44921875, | |
| "learning_rate": 4.0647891953820677e-07, | |
| "loss": 0.0895, | |
| "reward": 1.1640625149011612, | |
| "reward_std": 0.46265844255685806, | |
| "rewards/accuracy_reward": 0.4270833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7369791865348816, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 647.0104370117188, | |
| "epoch": 0.6442666666666667, | |
| "grad_norm": 5.592858764517026, | |
| "kl": 1.2412109375, | |
| "learning_rate": 4.0330031499628327e-07, | |
| "loss": 0.0744, | |
| "reward": 1.1666667014360428, | |
| "reward_std": 0.4661066606640816, | |
| "rewards/accuracy_reward": 0.4479166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7187500149011612, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 615.4583435058594, | |
| "epoch": 0.6464, | |
| "grad_norm": 2.9391531442419336, | |
| "kl": 1.37890625, | |
| "learning_rate": 4.00129879338185e-07, | |
| "loss": 0.051, | |
| "reward": 1.3697917461395264, | |
| "reward_std": 0.40326759219169617, | |
| "rewards/accuracy_reward": 0.5729166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7968750298023224, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 675.8333435058594, | |
| "epoch": 0.6485333333333333, | |
| "grad_norm": 7.164162181480773, | |
| "kl": 2.1025390625, | |
| "learning_rate": 3.969677891077169e-07, | |
| "loss": 0.1793, | |
| "reward": 1.0364583730697632, | |
| "reward_std": 0.436903640627861, | |
| "rewards/accuracy_reward": 0.3437500111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.692708358168602, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 654.8021087646484, | |
| "epoch": 0.6506666666666666, | |
| "grad_norm": 3.704485718269989, | |
| "kl": 1.09619140625, | |
| "learning_rate": 3.938142203839739e-07, | |
| "loss": 0.0982, | |
| "reward": 1.229166716337204, | |
| "reward_std": 0.47030629962682724, | |
| "rewards/accuracy_reward": 0.5312500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6979166865348816, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 734.5208587646484, | |
| "epoch": 0.6528, | |
| "grad_norm": 3.8971154349180575, | |
| "kl": 1.0986328125, | |
| "learning_rate": 3.906693487715358e-07, | |
| "loss": 0.1313, | |
| "reward": 1.138020858168602, | |
| "reward_std": 0.47273699939250946, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.731770858168602, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 671.8229370117188, | |
| "epoch": 0.6549333333333334, | |
| "grad_norm": 4.1318161142771395, | |
| "kl": 1.1552734375, | |
| "learning_rate": 3.875333493906889e-07, | |
| "loss": 0.0962, | |
| "reward": 1.1223958730697632, | |
| "reward_std": 0.5251179039478302, | |
| "rewards/accuracy_reward": 0.4166666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7057291865348816, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 635.8854370117188, | |
| "epoch": 0.6570666666666667, | |
| "grad_norm": 2.0719389846315903, | |
| "kl": 1.109375, | |
| "learning_rate": 3.844063968676747e-07, | |
| "loss": 0.1291, | |
| "reward": 1.1276042014360428, | |
| "reward_std": 0.47504569590091705, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7317708432674408, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 567.5521011352539, | |
| "epoch": 0.6592, | |
| "grad_norm": 7.764838278480252, | |
| "kl": 0.81787109375, | |
| "learning_rate": 3.8128866532496575e-07, | |
| "loss": 0.0984, | |
| "reward": 1.3619791865348816, | |
| "reward_std": 0.4738384932279587, | |
| "rewards/accuracy_reward": 0.5833333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7786458432674408, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 686.8854370117188, | |
| "epoch": 0.6613333333333333, | |
| "grad_norm": 6.011262528901589, | |
| "kl": 0.507568359375, | |
| "learning_rate": 3.7818032837157e-07, | |
| "loss": 0.0566, | |
| "reward": 1.3359375298023224, | |
| "reward_std": 0.4312174804508686, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7734375149011612, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 779.8229217529297, | |
| "epoch": 0.6634666666666666, | |
| "grad_norm": 1.9236708416924095, | |
| "kl": 1.16015625, | |
| "learning_rate": 3.7508155909336324e-07, | |
| "loss": 0.0663, | |
| "reward": 0.9791667014360428, | |
| "reward_std": 0.4290865398943424, | |
| "rewards/accuracy_reward": 0.3020833395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.677083358168602, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 691.8541870117188, | |
| "epoch": 0.6656, | |
| "grad_norm": 4.249549102956239, | |
| "kl": 1.0087890625, | |
| "learning_rate": 3.719925300434511e-07, | |
| "loss": 0.0839, | |
| "reward": 1.213541716337204, | |
| "reward_std": 0.4954788386821747, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7343750298023224, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 733.5833587646484, | |
| "epoch": 0.6677333333333333, | |
| "grad_norm": 6.248005893026083, | |
| "kl": 1.12109375, | |
| "learning_rate": 3.6891341323256044e-07, | |
| "loss": 0.1242, | |
| "reward": 1.1536458730697632, | |
| "reward_std": 0.44258614629507065, | |
| "rewards/accuracy_reward": 0.4166666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7369791865348816, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 641.7500305175781, | |
| "epoch": 0.6698666666666667, | |
| "grad_norm": 3.0058889714754575, | |
| "kl": 1.474609375, | |
| "learning_rate": 3.6584438011946093e-07, | |
| "loss": 0.0957, | |
| "reward": 1.1041667014360428, | |
| "reward_std": 0.5252336710691452, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6875000298023224, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 695.4791870117188, | |
| "epoch": 0.672, | |
| "grad_norm": 1.4454918734522983, | |
| "kl": 1.283203125, | |
| "learning_rate": 3.627856016014177e-07, | |
| "loss": 0.044, | |
| "reward": 1.0625000298023224, | |
| "reward_std": 0.4147951230406761, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.708333358168602, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 702.3125152587891, | |
| "epoch": 0.6741333333333334, | |
| "grad_norm": 3.5017500820603495, | |
| "kl": 1.1630859375, | |
| "learning_rate": 3.5973724800467487e-07, | |
| "loss": 0.0894, | |
| "reward": 0.997395858168602, | |
| "reward_std": 0.5040253773331642, | |
| "rewards/accuracy_reward": 0.2708333460614085, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7265625149011612, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 795.8437805175781, | |
| "epoch": 0.6762666666666667, | |
| "grad_norm": 2.726416422984257, | |
| "kl": 1.3046875, | |
| "learning_rate": 3.5669948907497106e-07, | |
| "loss": 0.0359, | |
| "reward": 0.9765625149011612, | |
| "reward_std": 0.5282137468457222, | |
| "rewards/accuracy_reward": 0.322916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.653645858168602, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 811.9062652587891, | |
| "epoch": 0.6784, | |
| "grad_norm": 2.1169915873642635, | |
| "kl": 1.3486328125, | |
| "learning_rate": 3.536724939680873e-07, | |
| "loss": 0.0274, | |
| "reward": 0.966145858168602, | |
| "reward_std": 0.43981292843818665, | |
| "rewards/accuracy_reward": 0.2604166707023978, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7057291716337204, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 729.2812652587891, | |
| "epoch": 0.6805333333333333, | |
| "grad_norm": 4.476093612695441, | |
| "kl": 1.8857421875, | |
| "learning_rate": 3.506564312404274e-07, | |
| "loss": 0.0958, | |
| "reward": 0.9791666865348816, | |
| "reward_std": 0.4137794151902199, | |
| "rewards/accuracy_reward": 0.2916666828095913, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6875000149011612, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 694.7812652587891, | |
| "epoch": 0.6826666666666666, | |
| "grad_norm": 3.8611937456604775, | |
| "kl": 1.583984375, | |
| "learning_rate": 3.476514688396326e-07, | |
| "loss": 0.0061, | |
| "reward": 1.151041716337204, | |
| "reward_std": 0.4715312048792839, | |
| "rewards/accuracy_reward": 0.4583333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.692708358168602, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 641.6354370117188, | |
| "epoch": 0.6848, | |
| "grad_norm": 2.2123043385070162, | |
| "kl": 1.3046875, | |
| "learning_rate": 3.446577740952291e-07, | |
| "loss": 0.0959, | |
| "reward": 1.114583358168602, | |
| "reward_std": 0.5834785029292107, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.708333358168602, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 748.6771087646484, | |
| "epoch": 0.6869333333333333, | |
| "grad_norm": 3.373430717613711, | |
| "kl": 1.19140625, | |
| "learning_rate": 3.416755137093095e-07, | |
| "loss": 0.0787, | |
| "reward": 1.0859375298023224, | |
| "reward_std": 0.4723443537950516, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6692708432674408, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 737.6146087646484, | |
| "epoch": 0.6890666666666667, | |
| "grad_norm": 3.3096316727110078, | |
| "kl": 1.2021484375, | |
| "learning_rate": 3.387048537472521e-07, | |
| "loss": 0.0327, | |
| "reward": 1.0078125298023224, | |
| "reward_std": 0.3858821913599968, | |
| "rewards/accuracy_reward": 0.3333333386108279, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6744792014360428, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 694.6771087646484, | |
| "epoch": 0.6912, | |
| "grad_norm": 1.7472386206877715, | |
| "kl": 1.1787109375, | |
| "learning_rate": 3.3574595962847227e-07, | |
| "loss": 0.0701, | |
| "reward": 1.1171875596046448, | |
| "reward_std": 0.4745071604847908, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7421875298023224, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 730.9583435058594, | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 2.2603113757427393, | |
| "kl": 1.01025390625, | |
| "learning_rate": 3.327989961172112e-07, | |
| "loss": 0.033, | |
| "reward": 1.213541716337204, | |
| "reward_std": 0.4763711243867874, | |
| "rewards/accuracy_reward": 0.4895833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7239583432674408, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 714.6562652587891, | |
| "epoch": 0.6954666666666667, | |
| "grad_norm": 5.159537210095775, | |
| "kl": 2.650390625, | |
| "learning_rate": 3.2986412731336175e-07, | |
| "loss": 0.1354, | |
| "reward": 0.9687500298023224, | |
| "reward_std": 0.4482138305902481, | |
| "rewards/accuracy_reward": 0.322916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.645833358168602, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 655.8125152587891, | |
| "epoch": 0.6976, | |
| "grad_norm": 2.0645410683703584, | |
| "kl": 1.736328125, | |
| "learning_rate": 3.2694151664332966e-07, | |
| "loss": 0.001, | |
| "reward": 1.1093750447034836, | |
| "reward_std": 0.5294991061091423, | |
| "rewards/accuracy_reward": 0.4270833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822916865348816, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 707.5312805175781, | |
| "epoch": 0.6997333333333333, | |
| "grad_norm": 2.172931619200017, | |
| "kl": 1.5546875, | |
| "learning_rate": 3.240313268509345e-07, | |
| "loss": 0.1471, | |
| "reward": 1.0208333730697632, | |
| "reward_std": 0.477165549993515, | |
| "rewards/accuracy_reward": 0.3437500037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6770833432674408, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 629.5833587646484, | |
| "epoch": 0.7018666666666666, | |
| "grad_norm": 2.0240169487056354, | |
| "kl": 1.0498046875, | |
| "learning_rate": 3.211337199883467e-07, | |
| "loss": 0.0698, | |
| "reward": 1.4375, | |
| "reward_std": 0.5091161876916885, | |
| "rewards/accuracy_reward": 0.6458333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7916666865348816, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 730.8125152587891, | |
| "epoch": 0.704, | |
| "grad_norm": 2.315013907534432, | |
| "kl": 1.71875, | |
| "learning_rate": 3.182488574070632e-07, | |
| "loss": 0.0869, | |
| "reward": 0.9453125298023224, | |
| "reward_std": 0.5973611772060394, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6119792014360428, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 676.1146087646484, | |
| "epoch": 0.7061333333333333, | |
| "grad_norm": 2.457999078731587, | |
| "kl": 1.6494140625, | |
| "learning_rate": 3.153768997489239e-07, | |
| "loss": 0.0853, | |
| "reward": 1.1015625149011612, | |
| "reward_std": 0.48719264566898346, | |
| "rewards/accuracy_reward": 0.3958333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7057291865348816, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 736.5312805175781, | |
| "epoch": 0.7082666666666667, | |
| "grad_norm": 2.6010104787226003, | |
| "kl": 1.080078125, | |
| "learning_rate": 3.1251800693716547e-07, | |
| "loss": 0.1232, | |
| "reward": 1.0963542014360428, | |
| "reward_std": 0.5459525063633919, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.700520858168602, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 669.5416717529297, | |
| "epoch": 0.7104, | |
| "grad_norm": 1.8362139405765112, | |
| "kl": 1.056640625, | |
| "learning_rate": 3.0967233816751655e-07, | |
| "loss": -0.0228, | |
| "reward": 1.3125000298023224, | |
| "reward_std": 0.55170738697052, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7708333432674408, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 693.0833587646484, | |
| "epoch": 0.7125333333333334, | |
| "grad_norm": 2.5638786630495436, | |
| "kl": 1.7734375, | |
| "learning_rate": 3.0684005189933314e-07, | |
| "loss": 0.0765, | |
| "reward": 1.1458333730697632, | |
| "reward_std": 0.4304089844226837, | |
| "rewards/accuracy_reward": 0.4062500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.739583358168602, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 754.6458435058594, | |
| "epoch": 0.7146666666666667, | |
| "grad_norm": 1.8865381891279744, | |
| "kl": 0.892578125, | |
| "learning_rate": 3.0402130584677456e-07, | |
| "loss": 0.1089, | |
| "reward": 0.997395858168602, | |
| "reward_std": 0.4055178463459015, | |
| "rewards/accuracy_reward": 0.3020833469927311, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6953125149011612, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 771.9687652587891, | |
| "epoch": 0.7168, | |
| "grad_norm": 1.1455584808324157, | |
| "kl": 1.21875, | |
| "learning_rate": 3.012162569700208e-07, | |
| "loss": 0.0055, | |
| "reward": 1.0234375596046448, | |
| "reward_std": 0.4199650138616562, | |
| "rewards/accuracy_reward": 0.354166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.669270858168602, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 712.8333435058594, | |
| "epoch": 0.7189333333333333, | |
| "grad_norm": 2.342748172421835, | |
| "kl": 1.0478515625, | |
| "learning_rate": 2.984250614665339e-07, | |
| "loss": 0.0331, | |
| "reward": 1.0572917014360428, | |
| "reward_std": 0.5415750741958618, | |
| "rewards/accuracy_reward": 0.3750000111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822916716337204, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 795.4062652587891, | |
| "epoch": 0.7210666666666666, | |
| "grad_norm": 1.500532269463762, | |
| "kl": 1.345703125, | |
| "learning_rate": 2.9564787476235823e-07, | |
| "loss": 0.0801, | |
| "reward": 0.9765625447034836, | |
| "reward_std": 0.41543692350387573, | |
| "rewards/accuracy_reward": 0.2916666744276881, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6848958432674408, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 786.4167022705078, | |
| "epoch": 0.7232, | |
| "grad_norm": 3.819852222206904, | |
| "kl": 0.802734375, | |
| "learning_rate": 2.9288485150346726e-07, | |
| "loss": 0.0624, | |
| "reward": 1.013020858168602, | |
| "reward_std": 0.45187023282051086, | |
| "rewards/accuracy_reward": 0.3020833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7109375298023224, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 792.3125152587891, | |
| "epoch": 0.7253333333333334, | |
| "grad_norm": 2.0201043749021426, | |
| "kl": 1.5703125, | |
| "learning_rate": 2.901361455471508e-07, | |
| "loss": 0.0987, | |
| "reward": 0.963541716337204, | |
| "reward_std": 0.44301638007164, | |
| "rewards/accuracy_reward": 0.2812500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822916865348816, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 734.0104217529297, | |
| "epoch": 0.7274666666666667, | |
| "grad_norm": 1.3457584943918326, | |
| "kl": 0.85546875, | |
| "learning_rate": 2.87401909953449e-07, | |
| "loss": 0.0389, | |
| "reward": 1.0807292014360428, | |
| "reward_std": 0.41036995500326157, | |
| "rewards/accuracy_reward": 0.3333333469927311, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7473958432674408, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 659.6041870117188, | |
| "epoch": 0.7296, | |
| "grad_norm": 2.879175690681913, | |
| "kl": 1.6728515625, | |
| "learning_rate": 2.8468229697662803e-07, | |
| "loss": 0.1076, | |
| "reward": 1.2031250596046448, | |
| "reward_std": 0.39927640557289124, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.786458358168602, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 753.3958435058594, | |
| "epoch": 0.7317333333333333, | |
| "grad_norm": 2.327499375188182, | |
| "kl": 0.76953125, | |
| "learning_rate": 2.819774580567027e-07, | |
| "loss": 0.1083, | |
| "reward": 1.0989583730697632, | |
| "reward_std": 0.41911373659968376, | |
| "rewards/accuracy_reward": 0.3645833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7343750298023224, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 770.8854370117188, | |
| "epoch": 0.7338666666666667, | |
| "grad_norm": 1.9515342429633127, | |
| "kl": 1.1884765625, | |
| "learning_rate": 2.792875438110033e-07, | |
| "loss": 0.0376, | |
| "reward": 1.0, | |
| "reward_std": 0.4482066258788109, | |
| "rewards/accuracy_reward": 0.3020833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6979166865348816, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 745.03125, | |
| "epoch": 0.736, | |
| "grad_norm": 2.778715625313689, | |
| "kl": 1.2744140625, | |
| "learning_rate": 2.766127040257884e-07, | |
| "loss": 0.0789, | |
| "reward": 1.153645858168602, | |
| "reward_std": 0.43238315731287, | |
| "rewards/accuracy_reward": 0.385416679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7682292014360428, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 626.9166793823242, | |
| "epoch": 0.7381333333333333, | |
| "grad_norm": 2.061952546668428, | |
| "kl": 0.95703125, | |
| "learning_rate": 2.739530876479048e-07, | |
| "loss": 0.0938, | |
| "reward": 1.3880208730697632, | |
| "reward_std": 0.42479727417230606, | |
| "rewards/accuracy_reward": 0.5729166939854622, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.8151041865348816, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 629.4166717529297, | |
| "epoch": 0.7402666666666666, | |
| "grad_norm": 2.183821055027836, | |
| "kl": 0.6650390625, | |
| "learning_rate": 2.7130884277649214e-07, | |
| "loss": 0.0857, | |
| "reward": 1.3437500447034836, | |
| "reward_std": 0.4782296419143677, | |
| "rewards/accuracy_reward": 0.5729167014360428, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7708333432674408, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 736.6354370117188, | |
| "epoch": 0.7424, | |
| "grad_norm": 1.9371419950412827, | |
| "kl": 1.375, | |
| "learning_rate": 2.686801166547377e-07, | |
| "loss": 0.1098, | |
| "reward": 1.0026041865348816, | |
| "reward_std": 0.49701040238142014, | |
| "rewards/accuracy_reward": 0.3125000111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6901041865348816, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 762.0625152587891, | |
| "epoch": 0.7445333333333334, | |
| "grad_norm": 1.7558253394051107, | |
| "kl": 1.203125, | |
| "learning_rate": 2.6606705566167674e-07, | |
| "loss": 0.0869, | |
| "reward": 1.143229216337204, | |
| "reward_std": 0.46467429399490356, | |
| "rewards/accuracy_reward": 0.4166666818782687, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7265625149011612, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 753.2812652587891, | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 1.9709169916263403, | |
| "kl": 1.4423828125, | |
| "learning_rate": 2.6346980530404004e-07, | |
| "loss": 0.122, | |
| "reward": 1.0130208730697632, | |
| "reward_std": 0.4813590347766876, | |
| "rewards/accuracy_reward": 0.3125000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7005208432674408, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 723.7187652587891, | |
| "epoch": 0.7488, | |
| "grad_norm": 4.56297873066424, | |
| "kl": 2.19921875, | |
| "learning_rate": 2.6088851020815384e-07, | |
| "loss": 0.1464, | |
| "reward": 1.1302083432674408, | |
| "reward_std": 0.47396688908338547, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.723958358168602, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 747.7812652587891, | |
| "epoch": 0.7509333333333333, | |
| "grad_norm": 1.9631494506608835, | |
| "kl": 0.8564453125, | |
| "learning_rate": 2.5832331411188474e-07, | |
| "loss": 0.1031, | |
| "reward": 1.330729216337204, | |
| "reward_std": 0.5036925226449966, | |
| "rewards/accuracy_reward": 0.5416666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7890625, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 765.1771087646484, | |
| "epoch": 0.7530666666666667, | |
| "grad_norm": 2.3865628511024988, | |
| "kl": 0.70703125, | |
| "learning_rate": 2.557743598566361e-07, | |
| "loss": 0.0598, | |
| "reward": 1.2031250298023224, | |
| "reward_std": 0.38255368173122406, | |
| "rewards/accuracy_reward": 0.4479166716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.755208358168602, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 746.8750152587891, | |
| "epoch": 0.7552, | |
| "grad_norm": 6.9166625658423655, | |
| "kl": 1.998046875, | |
| "learning_rate": 2.5324178937939436e-07, | |
| "loss": 0.1578, | |
| "reward": 1.174479216337204, | |
| "reward_std": 0.5697176456451416, | |
| "rewards/accuracy_reward": 0.479166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6953125149011612, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 639.7916870117188, | |
| "epoch": 0.7573333333333333, | |
| "grad_norm": 2.425626939261767, | |
| "kl": 1.46435546875, | |
| "learning_rate": 2.507257437048249e-07, | |
| "loss": 0.0574, | |
| "reward": 1.2734375298023224, | |
| "reward_std": 0.3452693969011307, | |
| "rewards/accuracy_reward": 0.479166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7942708432674408, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 726.03125, | |
| "epoch": 0.7594666666666666, | |
| "grad_norm": 3.260344436398864, | |
| "kl": 1.20703125, | |
| "learning_rate": 2.482263629374197e-07, | |
| "loss": 0.0652, | |
| "reward": 1.0000000149011612, | |
| "reward_std": 0.49797503650188446, | |
| "rewards/accuracy_reward": 0.3020833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6979166865348816, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 739.7500152587891, | |
| "epoch": 0.7616, | |
| "grad_norm": 3.207979066227694, | |
| "kl": 1.52734375, | |
| "learning_rate": 2.4574378625369526e-07, | |
| "loss": 0.1102, | |
| "reward": 1.096354216337204, | |
| "reward_std": 0.5646106451749802, | |
| "rewards/accuracy_reward": 0.3958333469927311, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.700520858168602, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 674.0312576293945, | |
| "epoch": 0.7637333333333334, | |
| "grad_norm": 5.24696288876381, | |
| "kl": 1.8115234375, | |
| "learning_rate": 2.432781518944425e-07, | |
| "loss": 0.1214, | |
| "reward": 1.2473958879709244, | |
| "reward_std": 0.3957555927336216, | |
| "rewards/accuracy_reward": 0.4895833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7578125149011612, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 712.7396087646484, | |
| "epoch": 0.7658666666666667, | |
| "grad_norm": 2.503201463336373, | |
| "kl": 1.0009765625, | |
| "learning_rate": 2.408295971570297e-07, | |
| "loss": 0.0721, | |
| "reward": 1.2005208730697632, | |
| "reward_std": 0.5229354798793793, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7630208730697632, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 740.5937805175781, | |
| "epoch": 0.768, | |
| "grad_norm": 2.7759740395623105, | |
| "kl": 2.009765625, | |
| "learning_rate": 2.3839825838775598e-07, | |
| "loss": 0.0173, | |
| "reward": 1.2500000447034836, | |
| "reward_std": 0.5089648514986038, | |
| "rewards/accuracy_reward": 0.5104166939854622, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.739583358168602, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 739.4583435058594, | |
| "epoch": 0.7701333333333333, | |
| "grad_norm": 3.2691753356813398, | |
| "kl": 1.5390625, | |
| "learning_rate": 2.359842709742603e-07, | |
| "loss": 0.1274, | |
| "reward": 1.0546875447034836, | |
| "reward_std": 0.4684048518538475, | |
| "rewards/accuracy_reward": 0.36458334140479565, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6901041865348816, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 737.2812652587891, | |
| "epoch": 0.7722666666666667, | |
| "grad_norm": 3.5068196972207435, | |
| "kl": 1.0185546875, | |
| "learning_rate": 2.3358776933798163e-07, | |
| "loss": 0.1049, | |
| "reward": 1.2187500298023224, | |
| "reward_std": 0.5481763109564781, | |
| "rewards/accuracy_reward": 0.4687500298023224, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7500000149011612, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 750.8229522705078, | |
| "epoch": 0.7744, | |
| "grad_norm": 3.44354047068292, | |
| "kl": 1.3896484375, | |
| "learning_rate": 2.3120888692667355e-07, | |
| "loss": 0.0889, | |
| "reward": 0.9791666865348816, | |
| "reward_std": 0.4294763505458832, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.645833358168602, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 793.1770935058594, | |
| "epoch": 0.7765333333333333, | |
| "grad_norm": 3.8786939873244735, | |
| "kl": 1.611328125, | |
| "learning_rate": 2.2884775620697396e-07, | |
| "loss": 0.0847, | |
| "reward": 0.8437500298023224, | |
| "reward_std": 0.41653449833393097, | |
| "rewards/accuracy_reward": 0.25000000838190317, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5937500074505806, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 657.4583587646484, | |
| "epoch": 0.7786666666666666, | |
| "grad_norm": 2.7534361801390785, | |
| "kl": 1.38671875, | |
| "learning_rate": 2.2650450865702873e-07, | |
| "loss": 0.0728, | |
| "reward": 1.0989583730697632, | |
| "reward_std": 0.6507840603590012, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7031250149011612, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 730.0208435058594, | |
| "epoch": 0.7808, | |
| "grad_norm": 1.7535269783976952, | |
| "kl": 0.80224609375, | |
| "learning_rate": 2.2417927475916948e-07, | |
| "loss": 0.0451, | |
| "reward": 1.2161458730697632, | |
| "reward_std": 0.3546758443117142, | |
| "rewards/accuracy_reward": 0.4583333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7578125298023224, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 774.8125305175781, | |
| "epoch": 0.7829333333333334, | |
| "grad_norm": 3.3253787412222624, | |
| "kl": 0.931640625, | |
| "learning_rate": 2.218721839926493e-07, | |
| "loss": 0.1163, | |
| "reward": 1.0703125447034836, | |
| "reward_std": 0.4362204819917679, | |
| "rewards/accuracy_reward": 0.3229166753590107, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7473958432674408, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 752.9375152587891, | |
| "epoch": 0.7850666666666667, | |
| "grad_norm": 1.4617385019781453, | |
| "kl": 1.5615234375, | |
| "learning_rate": 2.1958336482643119e-07, | |
| "loss": 0.08, | |
| "reward": 0.9192708805203438, | |
| "reward_std": 0.4304827004671097, | |
| "rewards/accuracy_reward": 0.27083334419876337, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6484375149011612, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 679.8958587646484, | |
| "epoch": 0.7872, | |
| "grad_norm": 2.585021511333178, | |
| "kl": 1.0693359375, | |
| "learning_rate": 2.173129447120354e-07, | |
| "loss": 0.0458, | |
| "reward": 1.1744792014360428, | |
| "reward_std": 0.34926126152276993, | |
| "rewards/accuracy_reward": 0.4166666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7578125149011612, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 778.9062805175781, | |
| "epoch": 0.7893333333333333, | |
| "grad_norm": 1.55450645453269, | |
| "kl": 0.763671875, | |
| "learning_rate": 2.1506105007644215e-07, | |
| "loss": 0.084, | |
| "reward": 1.0677083879709244, | |
| "reward_std": 0.340598925948143, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7343750149011612, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 737.7500152587891, | |
| "epoch": 0.7914666666666667, | |
| "grad_norm": 4.097252915121006, | |
| "kl": 0.8349609375, | |
| "learning_rate": 2.1282780631505106e-07, | |
| "loss": 0.0996, | |
| "reward": 1.2265625596046448, | |
| "reward_std": 0.4554222673177719, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7682291865348816, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 750.7708435058594, | |
| "epoch": 0.7936, | |
| "grad_norm": 2.7685818015071133, | |
| "kl": 0.6591796875, | |
| "learning_rate": 2.106133377846996e-07, | |
| "loss": 0.079, | |
| "reward": 1.1250000596046448, | |
| "reward_std": 0.3624304011464119, | |
| "rewards/accuracy_reward": 0.3645833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7604166865348816, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 706.5520935058594, | |
| "epoch": 0.7957333333333333, | |
| "grad_norm": 1.4575123252714652, | |
| "kl": 0.8974609375, | |
| "learning_rate": 2.0841776779673712e-07, | |
| "loss": 0.1091, | |
| "reward": 1.119791716337204, | |
| "reward_std": 0.39200131222605705, | |
| "rewards/accuracy_reward": 0.39583334885537624, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.723958358168602, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 712.9583435058594, | |
| "epoch": 0.7978666666666666, | |
| "grad_norm": 2.232824880577458, | |
| "kl": 1.0927734375, | |
| "learning_rate": 2.0624121861015957e-07, | |
| "loss": 0.0676, | |
| "reward": 1.1640625298023224, | |
| "reward_std": 0.38127563893795013, | |
| "rewards/accuracy_reward": 0.4166666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7473958432674408, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 625.6771087646484, | |
| "epoch": 0.8, | |
| "grad_norm": 5.941213344681978, | |
| "kl": 1.75, | |
| "learning_rate": 2.040838114248009e-07, | |
| "loss": 0.0789, | |
| "reward": 1.3958333730697632, | |
| "reward_std": 0.35454390197992325, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7916666865348816, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 737.3020935058594, | |
| "epoch": 0.8021333333333334, | |
| "grad_norm": 5.357951916612967, | |
| "kl": 1.1103515625, | |
| "learning_rate": 2.019456663745839e-07, | |
| "loss": 0.1002, | |
| "reward": 1.190104216337204, | |
| "reward_std": 0.45377591252326965, | |
| "rewards/accuracy_reward": 0.447916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7421875149011612, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 647.6146087646484, | |
| "epoch": 0.8042666666666667, | |
| "grad_norm": 2.4257869176075304, | |
| "kl": 1.609375, | |
| "learning_rate": 1.9982690252083124e-07, | |
| "loss": 0.0367, | |
| "reward": 1.2239583730697632, | |
| "reward_std": 0.5055340826511383, | |
| "rewards/accuracy_reward": 0.5000000223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7239583432674408, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 698.1041870117188, | |
| "epoch": 0.8064, | |
| "grad_norm": 1.2589748647570982, | |
| "kl": 0.4638671875, | |
| "learning_rate": 1.9772763784563515e-07, | |
| "loss": 0.0578, | |
| "reward": 1.354166716337204, | |
| "reward_std": 0.40980012714862823, | |
| "rewards/accuracy_reward": 0.5312500223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.8229166865348816, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 644.2500305175781, | |
| "epoch": 0.8085333333333333, | |
| "grad_norm": 1.6568574980602617, | |
| "kl": 0.8720703125, | |
| "learning_rate": 1.956479892452878e-07, | |
| "loss": 0.1696, | |
| "reward": 1.2265625149011612, | |
| "reward_std": 0.4330429956316948, | |
| "rewards/accuracy_reward": 0.4583333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7682291865348816, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 812.9062652587891, | |
| "epoch": 0.8106666666666666, | |
| "grad_norm": 4.6751933457063455, | |
| "kl": 1.73291015625, | |
| "learning_rate": 1.9358807252377224e-07, | |
| "loss": 0.1125, | |
| "reward": 0.911458358168602, | |
| "reward_std": 0.3435791879892349, | |
| "rewards/accuracy_reward": 0.229166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822916865348816, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 705.5729217529297, | |
| "epoch": 0.8128, | |
| "grad_norm": 2.202804636105119, | |
| "kl": 1.5283203125, | |
| "learning_rate": 1.915480023863134e-07, | |
| "loss": 0.119, | |
| "reward": 1.2317708730697632, | |
| "reward_std": 0.4287576675415039, | |
| "rewards/accuracy_reward": 0.4687500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7630208432674408, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 772.1041870117188, | |
| "epoch": 0.8149333333333333, | |
| "grad_norm": 1.578475813955147, | |
| "kl": 0.9765625, | |
| "learning_rate": 1.895278924329914e-07, | |
| "loss": 0.1039, | |
| "reward": 1.0156250596046448, | |
| "reward_std": 0.5112209767103195, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822916865348816, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 663.3021087646484, | |
| "epoch": 0.8170666666666667, | |
| "grad_norm": 1.7491393019341375, | |
| "kl": 0.58837890625, | |
| "learning_rate": 1.8752785515241533e-07, | |
| "loss": 0.0633, | |
| "reward": 1.3177083730697632, | |
| "reward_std": 0.4606628455221653, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7760417014360428, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 719.3541870117188, | |
| "epoch": 0.8192, | |
| "grad_norm": 2.126480419779782, | |
| "kl": 1.0810546875, | |
| "learning_rate": 1.8554800191545954e-07, | |
| "loss": 0.0575, | |
| "reward": 1.1562500298023224, | |
| "reward_std": 0.35498932003974915, | |
| "rewards/accuracy_reward": 0.4270833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7291666865348816, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 742.5937652587891, | |
| "epoch": 0.8213333333333334, | |
| "grad_norm": 3.178627789937521, | |
| "kl": 1.314453125, | |
| "learning_rate": 1.8358844296906213e-07, | |
| "loss": 0.0802, | |
| "reward": 1.0807291865348816, | |
| "reward_std": 0.4337238222360611, | |
| "rewards/accuracy_reward": 0.3645833395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.716145858168602, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 721.6458587646484, | |
| "epoch": 0.8234666666666667, | |
| "grad_norm": 22.434183591624898, | |
| "kl": 1.8828125, | |
| "learning_rate": 1.816492874300856e-07, | |
| "loss": 0.1242, | |
| "reward": 1.166666716337204, | |
| "reward_std": 0.5493014454841614, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7291667014360428, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 684.4687805175781, | |
| "epoch": 0.8256, | |
| "grad_norm": 2.548088556426522, | |
| "kl": 0.99267578125, | |
| "learning_rate": 1.7973064327924126e-07, | |
| "loss": 0.0546, | |
| "reward": 1.1875000298023224, | |
| "reward_std": 0.5050683170557022, | |
| "rewards/accuracy_reward": 0.4270833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7604166865348816, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 734.2916717529297, | |
| "epoch": 0.8277333333333333, | |
| "grad_norm": 1.8889203482514176, | |
| "kl": 0.66015625, | |
| "learning_rate": 1.778326173550761e-07, | |
| "loss": 0.0335, | |
| "reward": 1.1171875447034836, | |
| "reward_std": 0.4616394564509392, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.763020858168602, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 746.7500152587891, | |
| "epoch": 0.8298666666666666, | |
| "grad_norm": 3.7971071317680223, | |
| "kl": 0.96484375, | |
| "learning_rate": 1.7595531534802315e-07, | |
| "loss": 0.1335, | |
| "reward": 1.1328125298023224, | |
| "reward_std": 0.34206103533506393, | |
| "rewards/accuracy_reward": 0.4062500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7265625298023224, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 735.3958435058594, | |
| "epoch": 0.832, | |
| "grad_norm": 1.7191547186183351, | |
| "kl": 1.2353515625, | |
| "learning_rate": 1.7409884179451712e-07, | |
| "loss": 0.0827, | |
| "reward": 1.0156250298023224, | |
| "reward_std": 0.463408388197422, | |
| "rewards/accuracy_reward": 0.3125000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7031250149011612, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 835.9479370117188, | |
| "epoch": 0.8341333333333333, | |
| "grad_norm": 2.4409318975848224, | |
| "kl": 1.857421875, | |
| "learning_rate": 1.722633000711723e-07, | |
| "loss": 0.0711, | |
| "reward": 0.9947917014360428, | |
| "reward_std": 0.42818325012922287, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.661458358168602, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 719.0208435058594, | |
| "epoch": 0.8362666666666667, | |
| "grad_norm": 2.650545804796997, | |
| "kl": 1.470703125, | |
| "learning_rate": 1.7044879238902673e-07, | |
| "loss": 0.0996, | |
| "reward": 1.1041667014360428, | |
| "reward_std": 0.5181029364466667, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6875000149011612, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 674.6771087646484, | |
| "epoch": 0.8384, | |
| "grad_norm": 3.0426392653315784, | |
| "kl": 1.955078125, | |
| "learning_rate": 1.6865541978785082e-07, | |
| "loss": 0.1673, | |
| "reward": 1.091145858168602, | |
| "reward_std": 0.5123106092214584, | |
| "rewards/accuracy_reward": 0.4062500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6848958432674408, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 685.5416870117188, | |
| "epoch": 0.8405333333333334, | |
| "grad_norm": 2.2798995105984106, | |
| "kl": 1.1328125, | |
| "learning_rate": 1.6688328213052017e-07, | |
| "loss": 0.1729, | |
| "reward": 1.0963541865348816, | |
| "reward_std": 0.4155949652194977, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7213541716337204, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 752.3125305175781, | |
| "epoch": 0.8426666666666667, | |
| "grad_norm": 2.9684998404236205, | |
| "kl": 1.0869140625, | |
| "learning_rate": 1.6513247809745584e-07, | |
| "loss": 0.1393, | |
| "reward": 0.9869792014360428, | |
| "reward_std": 0.4611617475748062, | |
| "rewards/accuracy_reward": 0.3020833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.684895858168602, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 765.3333587646484, | |
| "epoch": 0.8448, | |
| "grad_norm": 2.267031288886419, | |
| "kl": 0.880859375, | |
| "learning_rate": 1.6340310518112837e-07, | |
| "loss": 0.0449, | |
| "reward": 1.1614583879709244, | |
| "reward_std": 0.5570357888936996, | |
| "rewards/accuracy_reward": 0.447916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7135416865348816, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 732.6562652587891, | |
| "epoch": 0.8469333333333333, | |
| "grad_norm": 2.968716228731711, | |
| "kl": 1.0556640625, | |
| "learning_rate": 1.6169525968062963e-07, | |
| "loss": 0.0572, | |
| "reward": 1.0911458432674408, | |
| "reward_std": 0.5004110783338547, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7161458432674408, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 641.0208587646484, | |
| "epoch": 0.8490666666666666, | |
| "grad_norm": 2.3709992838502982, | |
| "kl": 0.982421875, | |
| "learning_rate": 1.600090366963105e-07, | |
| "loss": 0.1097, | |
| "reward": 1.3515625298023224, | |
| "reward_std": 0.5505589917302132, | |
| "rewards/accuracy_reward": 0.5729166716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7786458432674408, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 754.0521087646484, | |
| "epoch": 0.8512, | |
| "grad_norm": 3.3869434945750236, | |
| "kl": 1.7177734375, | |
| "learning_rate": 1.5834453012448454e-07, | |
| "loss": 0.1107, | |
| "reward": 1.0156250149011612, | |
| "reward_std": 0.4361722990870476, | |
| "rewards/accuracy_reward": 0.3333333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6822916865348816, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 775.1458435058594, | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 1.7552632015249623, | |
| "kl": 1.130859375, | |
| "learning_rate": 1.5670183265220044e-07, | |
| "loss": 0.0558, | |
| "reward": 0.9921875447034836, | |
| "reward_std": 0.4147378206253052, | |
| "rewards/accuracy_reward": 0.2916666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.700520858168602, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 695.0000305175781, | |
| "epoch": 0.8554666666666667, | |
| "grad_norm": 2.4422984120573465, | |
| "kl": 1.2822265625, | |
| "learning_rate": 1.5508103575207987e-07, | |
| "loss": 0.1304, | |
| "reward": 1.1901041865348816, | |
| "reward_std": 0.4788772165775299, | |
| "rewards/accuracy_reward": 0.4687500223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7213541865348816, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 722.1354217529297, | |
| "epoch": 0.8576, | |
| "grad_norm": 2.3491045798533823, | |
| "kl": 0.9541015625, | |
| "learning_rate": 1.534822296772245e-07, | |
| "loss": 0.0731, | |
| "reward": 1.143229216337204, | |
| "reward_std": 0.4044281542301178, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7682291865348816, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 718.5104370117188, | |
| "epoch": 0.8597333333333333, | |
| "grad_norm": 1.4604861227005714, | |
| "kl": 0.861328125, | |
| "learning_rate": 1.519055034561902e-07, | |
| "loss": 0.1305, | |
| "reward": 1.1484375298023224, | |
| "reward_std": 0.551568478345871, | |
| "rewards/accuracy_reward": 0.4270833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7213541716337204, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 809.4271087646484, | |
| "epoch": 0.8618666666666667, | |
| "grad_norm": 1.9573260113015454, | |
| "kl": 1.150390625, | |
| "learning_rate": 1.5035094488802919e-07, | |
| "loss": 0.0859, | |
| "reward": 1.1328125298023224, | |
| "reward_std": 0.5649653822183609, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6953125149011612, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 708.5416870117188, | |
| "epoch": 0.864, | |
| "grad_norm": 1.8307966620112655, | |
| "kl": 0.994140625, | |
| "learning_rate": 1.488186405374015e-07, | |
| "loss": 0.0849, | |
| "reward": 1.2083333730697632, | |
| "reward_std": 0.5416659787297249, | |
| "rewards/accuracy_reward": 0.479166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7291666716337204, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 732.9791717529297, | |
| "epoch": 0.8661333333333333, | |
| "grad_norm": 1.5317692375594139, | |
| "kl": 0.8349609375, | |
| "learning_rate": 1.4730867572975427e-07, | |
| "loss": 0.1122, | |
| "reward": 1.1432291865348816, | |
| "reward_std": 0.4479290693998337, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.747395858168602, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 673.3021087646484, | |
| "epoch": 0.8682666666666666, | |
| "grad_norm": 3.057221820656228, | |
| "kl": 1.0791015625, | |
| "learning_rate": 1.4582113454657056e-07, | |
| "loss": 0.0773, | |
| "reward": 1.2578125298023224, | |
| "reward_std": 0.5970419347286224, | |
| "rewards/accuracy_reward": 0.4687500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7890625298023224, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 632.8229370117188, | |
| "epoch": 0.8704, | |
| "grad_norm": 1.750668342116461, | |
| "kl": 1.111328125, | |
| "learning_rate": 1.4435609982068764e-07, | |
| "loss": 0.119, | |
| "reward": 1.3802083432674408, | |
| "reward_std": 0.47477005422115326, | |
| "rewards/accuracy_reward": 0.5833333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7968750298023224, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 714.9479370117188, | |
| "epoch": 0.8725333333333334, | |
| "grad_norm": 2.053669308043276, | |
| "kl": 1.150390625, | |
| "learning_rate": 1.4291365313168391e-07, | |
| "loss": 0.0093, | |
| "reward": 1.1093750298023224, | |
| "reward_std": 0.4146904796361923, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7343750298023224, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 737.0416717529297, | |
| "epoch": 0.8746666666666667, | |
| "grad_norm": 2.067203499829489, | |
| "kl": 1.23046875, | |
| "learning_rate": 1.4149387480133674e-07, | |
| "loss": -0.0144, | |
| "reward": 1.0260416865348816, | |
| "reward_std": 0.46504535526037216, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6718750149011612, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 782.4479370117188, | |
| "epoch": 0.8768, | |
| "grad_norm": 2.201137205854952, | |
| "kl": 1.279296875, | |
| "learning_rate": 1.4009684388914954e-07, | |
| "loss": 0.0488, | |
| "reward": 1.0208333730697632, | |
| "reward_std": 0.4742833971977234, | |
| "rewards/accuracy_reward": 0.3333333386108279, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6875000298023224, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 735.1146087646484, | |
| "epoch": 0.8789333333333333, | |
| "grad_norm": 2.0850893145919263, | |
| "kl": 0.9775390625, | |
| "learning_rate": 1.3872263818794915e-07, | |
| "loss": 0.054, | |
| "reward": 1.252604216337204, | |
| "reward_std": 0.47272689640522003, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7526042014360428, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 737.8958435058594, | |
| "epoch": 0.8810666666666667, | |
| "grad_norm": 3.903653333883459, | |
| "kl": 1.1669921875, | |
| "learning_rate": 1.3737133421955477e-07, | |
| "loss": 0.1438, | |
| "reward": 1.0859375596046448, | |
| "reward_std": 0.3797566667199135, | |
| "rewards/accuracy_reward": 0.3854166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7005208432674408, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 734.1771087646484, | |
| "epoch": 0.8832, | |
| "grad_norm": 3.256399521259395, | |
| "kl": 1.779296875, | |
| "learning_rate": 1.360430072305157e-07, | |
| "loss": 0.0931, | |
| "reward": 1.1171875298023224, | |
| "reward_std": 0.3713390752673149, | |
| "rewards/accuracy_reward": 0.4270833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6901042014360428, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 755.0312728881836, | |
| "epoch": 0.8853333333333333, | |
| "grad_norm": 2.7713487572106423, | |
| "kl": 1.193359375, | |
| "learning_rate": 1.3473773118792247e-07, | |
| "loss": 0.0627, | |
| "reward": 1.0729166865348816, | |
| "reward_std": 0.4558168202638626, | |
| "rewards/accuracy_reward": 0.34375001303851604, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7291667014360428, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 714.4479370117188, | |
| "epoch": 0.8874666666666666, | |
| "grad_norm": 1.5917831263975801, | |
| "kl": 1.1884765625, | |
| "learning_rate": 1.3345557877528736e-07, | |
| "loss": 0.0228, | |
| "reward": 1.1536458730697632, | |
| "reward_std": 0.4172417223453522, | |
| "rewards/accuracy_reward": 0.4270833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7265625149011612, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 729.0208511352539, | |
| "epoch": 0.8896, | |
| "grad_norm": 1.8346189483153255, | |
| "kl": 0.908203125, | |
| "learning_rate": 1.3219662138849704e-07, | |
| "loss": 0.0762, | |
| "reward": 1.143229216337204, | |
| "reward_std": 0.5342837795615196, | |
| "rewards/accuracy_reward": 0.4270833395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.716145858168602, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 705.4271087646484, | |
| "epoch": 0.8917333333333334, | |
| "grad_norm": 3.0405441362276697, | |
| "kl": 0.93359375, | |
| "learning_rate": 1.309609291318374e-07, | |
| "loss": 0.0913, | |
| "reward": 1.2447917461395264, | |
| "reward_std": 0.48353345692157745, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7656250149011612, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 624.6771011352539, | |
| "epoch": 0.8938666666666667, | |
| "grad_norm": 2.4031973960368482, | |
| "kl": 0.90771484375, | |
| "learning_rate": 1.2974857081408933e-07, | |
| "loss": 0.1541, | |
| "reward": 1.1718750447034836, | |
| "reward_std": 0.44468991458415985, | |
| "rewards/accuracy_reward": 0.4166666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7552083432674408, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 644.5000152587891, | |
| "epoch": 0.896, | |
| "grad_norm": 2.7925764346431157, | |
| "kl": 1.61328125, | |
| "learning_rate": 1.2855961394469728e-07, | |
| "loss": 0.1007, | |
| "reward": 1.0364583730697632, | |
| "reward_std": 0.49245011806488037, | |
| "rewards/accuracy_reward": 0.3333333469927311, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7031250298023224, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 708.2604217529297, | |
| "epoch": 0.8981333333333333, | |
| "grad_norm": 1.9701217013199572, | |
| "kl": 0.52099609375, | |
| "learning_rate": 1.2739412473001038e-07, | |
| "loss": 0.0371, | |
| "reward": 1.1796875298023224, | |
| "reward_std": 0.30052849650382996, | |
| "rewards/accuracy_reward": 0.3645833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.8151042014360428, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 694.3020935058594, | |
| "epoch": 0.9002666666666667, | |
| "grad_norm": 2.082069714596071, | |
| "kl": 1.2578125, | |
| "learning_rate": 1.262521680695952e-07, | |
| "loss": 0.0728, | |
| "reward": 1.0182292014360428, | |
| "reward_std": 0.3820762485265732, | |
| "rewards/accuracy_reward": 0.2812500027939677, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7369791865348816, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 768.9166717529297, | |
| "epoch": 0.9024, | |
| "grad_norm": 3.3745301908072873, | |
| "kl": 1.65234375, | |
| "learning_rate": 1.251338075526224e-07, | |
| "loss": 0.045, | |
| "reward": 0.919270858168602, | |
| "reward_std": 0.40871209651231766, | |
| "rewards/accuracy_reward": 0.2708333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6484375149011612, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 645.3958435058594, | |
| "epoch": 0.9045333333333333, | |
| "grad_norm": 1.622420953888201, | |
| "kl": 0.919677734375, | |
| "learning_rate": 1.240391054543255e-07, | |
| "loss": 0.0611, | |
| "reward": 1.4062500447034836, | |
| "reward_std": 0.41048867627978325, | |
| "rewards/accuracy_reward": 0.5937500298023224, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.8125000298023224, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 628.5729522705078, | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 2.6284297956508955, | |
| "kl": 1.2958984375, | |
| "learning_rate": 1.2296812273253306e-07, | |
| "loss": 0.0822, | |
| "reward": 1.299479216337204, | |
| "reward_std": 0.45082786679267883, | |
| "rewards/accuracy_reward": 0.5104166939854622, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7890625149011612, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 704.5833435058594, | |
| "epoch": 0.9088, | |
| "grad_norm": 1.5533935931581717, | |
| "kl": 1.2666015625, | |
| "learning_rate": 1.2192091902427471e-07, | |
| "loss": 0.0736, | |
| "reward": 1.166666716337204, | |
| "reward_std": 0.5383878573775291, | |
| "rewards/accuracy_reward": 0.447916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7187500298023224, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 733.4687652587891, | |
| "epoch": 0.9109333333333334, | |
| "grad_norm": 5.307925548469623, | |
| "kl": 1.8876953125, | |
| "learning_rate": 1.208975526424596e-07, | |
| "loss": 0.1743, | |
| "reward": 0.9661458730697632, | |
| "reward_std": 0.5588085800409317, | |
| "rewards/accuracy_reward": 0.3020833395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6640625149011612, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 695.4167022705078, | |
| "epoch": 0.9130666666666667, | |
| "grad_norm": 1.9477324310942372, | |
| "kl": 1.1796875, | |
| "learning_rate": 1.1989808057262999e-07, | |
| "loss": 0.1144, | |
| "reward": 1.2109375298023224, | |
| "reward_std": 0.6319162100553513, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7317708730697632, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 707.7708587646484, | |
| "epoch": 0.9152, | |
| "grad_norm": 1.630302912855386, | |
| "kl": 0.966796875, | |
| "learning_rate": 1.1892255846978763e-07, | |
| "loss": 0.1098, | |
| "reward": 1.2317708730697632, | |
| "reward_std": 0.5329065248370171, | |
| "rewards/accuracy_reward": 0.4791666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7526041716337204, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 641.4687805175781, | |
| "epoch": 0.9173333333333333, | |
| "grad_norm": 1.503201933100271, | |
| "kl": 1.10888671875, | |
| "learning_rate": 1.179710406552947e-07, | |
| "loss": 0.1212, | |
| "reward": 1.2083333879709244, | |
| "reward_std": 0.5156615823507309, | |
| "rewards/accuracy_reward": 0.4895833358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7187500298023224, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 724.1979217529297, | |
| "epoch": 0.9194666666666667, | |
| "grad_norm": 3.6563152259064813, | |
| "kl": 1.2880859375, | |
| "learning_rate": 1.1704358011384915e-07, | |
| "loss": 0.0636, | |
| "reward": 0.9505208432674408, | |
| "reward_std": 0.4120257571339607, | |
| "rewards/accuracy_reward": 0.25, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7005208432674408, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 657.8229370117188, | |
| "epoch": 0.9216, | |
| "grad_norm": 1.9295206416493746, | |
| "kl": 1.548828125, | |
| "learning_rate": 1.161402284905339e-07, | |
| "loss": 0.1538, | |
| "reward": 1.1171875596046448, | |
| "reward_std": 0.5346331149339676, | |
| "rewards/accuracy_reward": 0.4270833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6901041865348816, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 693.7396240234375, | |
| "epoch": 0.9237333333333333, | |
| "grad_norm": 1.432096399760419, | |
| "kl": 1.0078125, | |
| "learning_rate": 1.1526103608794149e-07, | |
| "loss": 0.1123, | |
| "reward": 1.2187500596046448, | |
| "reward_std": 0.47654059529304504, | |
| "rewards/accuracy_reward": 0.4895833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7291666865348816, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 778.5729370117188, | |
| "epoch": 0.9258666666666666, | |
| "grad_norm": 2.6509053732035186, | |
| "kl": 1.279296875, | |
| "learning_rate": 1.1440605186337254e-07, | |
| "loss": -0.0008, | |
| "reward": 0.8645833730697632, | |
| "reward_std": 0.4510863274335861, | |
| "rewards/accuracy_reward": 0.1666666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6979166865348816, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 713.1250152587891, | |
| "epoch": 0.928, | |
| "grad_norm": 3.4533275842613125, | |
| "kl": 1.57421875, | |
| "learning_rate": 1.1357532342611005e-07, | |
| "loss": 0.0528, | |
| "reward": 1.1354167312383652, | |
| "reward_std": 0.490027979016304, | |
| "rewards/accuracy_reward": 0.3437500074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7916666865348816, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 737.1875152587891, | |
| "epoch": 0.9301333333333334, | |
| "grad_norm": 2.0824368172370975, | |
| "kl": 1.267578125, | |
| "learning_rate": 1.1276889703476789e-07, | |
| "loss": 0.0309, | |
| "reward": 1.1197916716337204, | |
| "reward_std": 0.47420261800289154, | |
| "rewards/accuracy_reward": 0.385416679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7343750298023224, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 751.4791717529297, | |
| "epoch": 0.9322666666666667, | |
| "grad_norm": 3.5797973969109487, | |
| "kl": 1.08984375, | |
| "learning_rate": 1.1198681759471522e-07, | |
| "loss": 0.0705, | |
| "reward": 1.0807292014360428, | |
| "reward_std": 0.40865882486104965, | |
| "rewards/accuracy_reward": 0.3645833460614085, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.716145858168602, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 731.7291870117188, | |
| "epoch": 0.9344, | |
| "grad_norm": 2.4222477889985123, | |
| "kl": 1.0146484375, | |
| "learning_rate": 1.1122912865557577e-07, | |
| "loss": 0.0581, | |
| "reward": 1.1380208730697632, | |
| "reward_std": 0.4775843694806099, | |
| "rewards/accuracy_reward": 0.3958333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7421875149011612, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 694.8750152587891, | |
| "epoch": 0.9365333333333333, | |
| "grad_norm": 2.9595686017050586, | |
| "kl": 1.4482421875, | |
| "learning_rate": 1.1049587240880295e-07, | |
| "loss": 0.0858, | |
| "reward": 1.0312500596046448, | |
| "reward_std": 0.5613971948623657, | |
| "rewards/accuracy_reward": 0.3333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6979166865348816, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 748.0104522705078, | |
| "epoch": 0.9386666666666666, | |
| "grad_norm": 3.6804903948236647, | |
| "kl": 1.63671875, | |
| "learning_rate": 1.0978708968533028e-07, | |
| "loss": 0.049, | |
| "reward": 1.1744792014360428, | |
| "reward_std": 0.541608139872551, | |
| "rewards/accuracy_reward": 0.4375000111758709, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7369791716337204, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 821.8645935058594, | |
| "epoch": 0.9408, | |
| "grad_norm": 2.254543727913351, | |
| "kl": 1.068359375, | |
| "learning_rate": 1.0910281995329798e-07, | |
| "loss": 0.0314, | |
| "reward": 0.8046875, | |
| "reward_std": 0.37333502247929573, | |
| "rewards/accuracy_reward": 0.14583333861082792, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6588541865348816, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 673.3646087646484, | |
| "epoch": 0.9429333333333333, | |
| "grad_norm": 4.607468976961304, | |
| "kl": 0.91015625, | |
| "learning_rate": 1.0844310131585496e-07, | |
| "loss": 0.0748, | |
| "reward": 1.1744791865348816, | |
| "reward_std": 0.5283230543136597, | |
| "rewards/accuracy_reward": 0.4479166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7265625298023224, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.3229370117188, | |
| "epoch": 0.9450666666666667, | |
| "grad_norm": 1.1400175426656556, | |
| "kl": 1.2119140625, | |
| "learning_rate": 1.0780797050903712e-07, | |
| "loss": 0.0422, | |
| "reward": 1.2786459028720856, | |
| "reward_std": 0.37607645988464355, | |
| "rewards/accuracy_reward": 0.4687500149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.8098958432674408, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 728.2812652587891, | |
| "epoch": 0.9472, | |
| "grad_norm": 1.9170494197283434, | |
| "kl": 1.0166015625, | |
| "learning_rate": 1.07197462899722e-07, | |
| "loss": 0.0662, | |
| "reward": 1.0625000298023224, | |
| "reward_std": 0.5107837617397308, | |
| "rewards/accuracy_reward": 0.34375000558793545, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7187500298023224, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 715.2604217529297, | |
| "epoch": 0.9493333333333334, | |
| "grad_norm": 2.873741465652177, | |
| "kl": 1.048828125, | |
| "learning_rate": 1.0661161248365888e-07, | |
| "loss": 0.1132, | |
| "reward": 1.2265625596046448, | |
| "reward_std": 0.6806151270866394, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7265625149011612, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 790.3437652587891, | |
| "epoch": 0.9514666666666667, | |
| "grad_norm": 3.6992638630578956, | |
| "kl": 0.8779296875, | |
| "learning_rate": 1.0605045188357633e-07, | |
| "loss": 0.0711, | |
| "reward": 1.0208333879709244, | |
| "reward_std": 0.5120280906558037, | |
| "rewards/accuracy_reward": 0.3020833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7187500298023224, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 767.4479370117188, | |
| "epoch": 0.9536, | |
| "grad_norm": 1.6337261755318853, | |
| "kl": 1.05078125, | |
| "learning_rate": 1.0551401234736524e-07, | |
| "loss": 0.1, | |
| "reward": 1.0104167014360428, | |
| "reward_std": 0.4127493128180504, | |
| "rewards/accuracy_reward": 0.2708333469927311, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7395833730697632, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 704.1875152587891, | |
| "epoch": 0.9557333333333333, | |
| "grad_norm": 16.258107928681625, | |
| "kl": 1.107421875, | |
| "learning_rate": 1.0500232374633883e-07, | |
| "loss": 0.1343, | |
| "reward": 1.3098959028720856, | |
| "reward_std": 0.5429221987724304, | |
| "rewards/accuracy_reward": 0.5312500223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7786458432674408, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 736.5104370117188, | |
| "epoch": 0.9578666666666666, | |
| "grad_norm": 2.394106906459762, | |
| "kl": 1.3466796875, | |
| "learning_rate": 1.0451541457356948e-07, | |
| "loss": 0.0688, | |
| "reward": 1.1380208730697632, | |
| "reward_std": 0.5113592520356178, | |
| "rewards/accuracy_reward": 0.42708334885537624, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7109375149011612, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 710.6666870117188, | |
| "epoch": 0.96, | |
| "grad_norm": 2.0483608586867628, | |
| "kl": 1.013671875, | |
| "learning_rate": 1.0405331194230196e-07, | |
| "loss": 0.1184, | |
| "reward": 1.0963542014360428, | |
| "reward_std": 0.42430291324853897, | |
| "rewards/accuracy_reward": 0.3750000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7213541716337204, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 705.1458435058594, | |
| "epoch": 0.9621333333333333, | |
| "grad_norm": 1.6428274102372595, | |
| "kl": 1.072265625, | |
| "learning_rate": 1.036160415844436e-07, | |
| "loss": 0.0393, | |
| "reward": 1.2578125298023224, | |
| "reward_std": 0.5380661189556122, | |
| "rewards/accuracy_reward": 0.5104166865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7473958432674408, | |
| "step": 451 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 619.3437652587891, | |
| "epoch": 0.9642666666666667, | |
| "grad_norm": 1.7551241747148723, | |
| "kl": 0.943359375, | |
| "learning_rate": 1.0320362784913168e-07, | |
| "loss": 0.0358, | |
| "reward": 1.1901042014360428, | |
| "reward_std": 0.5258347168564796, | |
| "rewards/accuracy_reward": 0.4375000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7526041865348816, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 757.4166870117188, | |
| "epoch": 0.9664, | |
| "grad_norm": 1.5896793238771025, | |
| "kl": 0.7626953125, | |
| "learning_rate": 1.0281609370137723e-07, | |
| "loss": 0.0252, | |
| "reward": 1.1875000298023224, | |
| "reward_std": 0.4521985128521919, | |
| "rewards/accuracy_reward": 0.4166666828095913, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.770833358168602, | |
| "step": 453 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 627.6041793823242, | |
| "epoch": 0.9685333333333334, | |
| "grad_norm": 3.8311761420761203, | |
| "kl": 1.84375, | |
| "learning_rate": 1.024534607207864e-07, | |
| "loss": 0.1581, | |
| "reward": 1.0677083879709244, | |
| "reward_std": 0.5132449232041836, | |
| "rewards/accuracy_reward": 0.3958333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6718750298023224, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 751.3750305175781, | |
| "epoch": 0.9706666666666667, | |
| "grad_norm": 3.41251835441772, | |
| "kl": 0.8310546875, | |
| "learning_rate": 1.0211574910035891e-07, | |
| "loss": 0.0948, | |
| "reward": 1.052083358168602, | |
| "reward_std": 0.43458399176597595, | |
| "rewards/accuracy_reward": 0.3125000102445483, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.739583358168602, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 727.3854217529297, | |
| "epoch": 0.9728, | |
| "grad_norm": 3.3487498391688364, | |
| "kl": 1.189453125, | |
| "learning_rate": 1.0180297764536348e-07, | |
| "loss": 0.0262, | |
| "reward": 1.0234375298023224, | |
| "reward_std": 0.5761856287717819, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6901041865348816, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 772.9896087646484, | |
| "epoch": 0.9749333333333333, | |
| "grad_norm": 1.4740237937952425, | |
| "kl": 1.1005859375, | |
| "learning_rate": 1.015151637722906e-07, | |
| "loss": 0.091, | |
| "reward": 0.9765625596046448, | |
| "reward_std": 0.485679030418396, | |
| "rewards/accuracy_reward": 0.2916666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.684895858168602, | |
| "step": 457 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 629.4375152587891, | |
| "epoch": 0.9770666666666666, | |
| "grad_norm": 2.088255511662504, | |
| "kl": 0.8876953125, | |
| "learning_rate": 1.0125232350788295e-07, | |
| "loss": 0.0873, | |
| "reward": 1.3671875596046448, | |
| "reward_std": 0.522374838590622, | |
| "rewards/accuracy_reward": 0.6041666865348816, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.763020858168602, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 668.4791870117188, | |
| "epoch": 0.9792, | |
| "grad_norm": 1.3331269505812111, | |
| "kl": 0.7607421875, | |
| "learning_rate": 1.0101447148824265e-07, | |
| "loss": 0.0169, | |
| "reward": 1.3880208730697632, | |
| "reward_std": 0.40413716435432434, | |
| "rewards/accuracy_reward": 0.572916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.8151041865348816, | |
| "step": 459 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 758.3125152587891, | |
| "epoch": 0.9813333333333333, | |
| "grad_norm": 2.0621763744430535, | |
| "kl": 0.9912109375, | |
| "learning_rate": 1.0080162095801662e-07, | |
| "loss": 0.0811, | |
| "reward": 1.072916716337204, | |
| "reward_std": 0.4392261281609535, | |
| "rewards/accuracy_reward": 0.3645833432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.708333358168602, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 676.0312805175781, | |
| "epoch": 0.9834666666666667, | |
| "grad_norm": 1.8110615199199416, | |
| "kl": 1.4462890625, | |
| "learning_rate": 1.006137837696587e-07, | |
| "loss": 0.0877, | |
| "reward": 1.2109375298023224, | |
| "reward_std": 0.38446957617998123, | |
| "rewards/accuracy_reward": 0.4375000223517418, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7734375149011612, | |
| "step": 461 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 734.4687652587891, | |
| "epoch": 0.9856, | |
| "grad_norm": 1.726873645760791, | |
| "kl": 0.890625, | |
| "learning_rate": 1.0045097038276994e-07, | |
| "loss": 0.0598, | |
| "reward": 1.0833333730697632, | |
| "reward_std": 0.41529713198542595, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7083333432674408, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 723.5104370117188, | |
| "epoch": 0.9877333333333334, | |
| "grad_norm": 2.8491371575048827, | |
| "kl": 1.3125, | |
| "learning_rate": 1.0031318986351587e-07, | |
| "loss": 0.024, | |
| "reward": 1.0078125149011612, | |
| "reward_std": 0.46036942303180695, | |
| "rewards/accuracy_reward": 0.322916679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6848958507180214, | |
| "step": 463 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 635.7083435058594, | |
| "epoch": 0.9898666666666667, | |
| "grad_norm": 1.6047054865632053, | |
| "kl": 1.0810546875, | |
| "learning_rate": 1.0020044988412196e-07, | |
| "loss": 0.0451, | |
| "reward": 1.2734375298023224, | |
| "reward_std": 0.6459824442863464, | |
| "rewards/accuracy_reward": 0.4895833507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7838541865348816, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 650.1146087646484, | |
| "epoch": 0.992, | |
| "grad_norm": 4.339234215911267, | |
| "kl": 1.330078125, | |
| "learning_rate": 1.0011275672244634e-07, | |
| "loss": 0.0628, | |
| "reward": 1.1770833432674408, | |
| "reward_std": 0.4456011652946472, | |
| "rewards/accuracy_reward": 0.4166666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7604166716337204, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 693.6875152587891, | |
| "epoch": 0.9941333333333333, | |
| "grad_norm": 1.617519930861052, | |
| "kl": 1.14453125, | |
| "learning_rate": 1.0005011526162988e-07, | |
| "loss": 0.0384, | |
| "reward": 1.1250000447034836, | |
| "reward_std": 0.5143625289201736, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7083333432674408, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 656.6145935058594, | |
| "epoch": 0.9962666666666666, | |
| "grad_norm": 3.2591696541640336, | |
| "kl": 0.826171875, | |
| "learning_rate": 1.0001252898982477e-07, | |
| "loss": 0.0027, | |
| "reward": 1.2786458730697632, | |
| "reward_std": 0.364062175154686, | |
| "rewards/accuracy_reward": 0.5000000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7786458432674408, | |
| "step": 467 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 764.1041870117188, | |
| "epoch": 0.9984, | |
| "grad_norm": 1.6261195919283198, | |
| "kl": 1.31689453125, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0665, | |
| "reward": 1.0260417014360428, | |
| "reward_std": 0.4544539228081703, | |
| "rewards/accuracy_reward": 0.3333333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6927083432674408, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.9984, | |
| "step": 468, | |
| "total_flos": 0.0, | |
| "train_loss": 0.08217587162211684, | |
| "train_runtime": 13418.4569, | |
| "train_samples_per_second": 0.559, | |
| "train_steps_per_second": 0.035 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 468, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |