{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998399743959033, "eval_steps": 500, "global_step": 3124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 779.4875244140625, "epoch": 0.0003200512081933109, "grad_norm": 0.0655626729130745, "kl": 0.0, "learning_rate": 6.389776357827476e-08, "loss": 0.0467, "reward": 0.2614583417773247, "reward_std": 0.32735898196697233, "rewards/accuracy_reward": 0.06875000279396773, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.18854167312383652, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 837.8854309082031, "epoch": 0.0006401024163866218, "grad_norm": 0.06388702988624573, "kl": 0.0, "learning_rate": 1.2779552715654952e-07, "loss": 0.0259, "reward": 0.2718750089406967, "reward_std": 0.2908510401844978, "rewards/accuracy_reward": 0.10625000298023224, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.1635416716337204, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 781.164599609375, "epoch": 0.0009601536245799327, "grad_norm": 0.06498929113149643, "kl": 0.0002956547366920859, "learning_rate": 1.9169329073482428e-07, "loss": 0.0036, "reward": 0.22395834028720857, "reward_std": 0.30153340846300125, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.17395833730697632, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 788.1146057128906, "epoch": 0.0012802048327732437, "grad_norm": 0.07566636800765991, "kl": 0.00030002407875144853, "learning_rate": 2.5559105431309904e-07, "loss": 0.0183, "reward": 0.3145833443850279, "reward_std": 0.3022632598876953, "rewards/accuracy_reward": 0.08541666772216558, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.22500000558793545, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 766.7708557128906, "epoch": 0.0016002560409665546, "grad_norm": 0.0667392835021019, "kl": 0.0002937636716524139, "learning_rate": 3.194888178913738e-07, "loss": 0.0383, "reward": 0.2546875052154064, "reward_std": 0.26029517203569413, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.1796875037252903, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 743.1791931152344, "epoch": 0.0019203072491598655, "grad_norm": 0.07169407606124878, "kl": 0.0003271137073170394, "learning_rate": 3.8338658146964857e-07, "loss": 0.0153, "reward": 0.24218750596046448, "reward_std": 0.30807158052921296, "rewards/accuracy_reward": 0.05625000111758709, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.18177083879709244, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 780.4708618164062, "epoch": 0.0022403584573531766, "grad_norm": 0.07363027334213257, "kl": 0.00030389373132493346, "learning_rate": 4.4728434504792333e-07, "loss": 0.0244, "reward": 0.20833334028720857, "reward_std": 0.308957539498806, "rewards/accuracy_reward": 0.043750000186264515, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.1625000037252903, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 748.7187683105469, "epoch": 0.0025604096655464873, "grad_norm": 0.07643438875675201, "kl": 0.00032084174163173886, "learning_rate": 5.111821086261981e-07, "loss": 0.0317, "reward": 0.32500000596046447, "reward_std": 0.326321017742157, "rewards/accuracy_reward": 0.09583333693444729, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2291666731238365, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 776.7271057128906, "epoch": 0.0028804608737397984, "grad_norm": 0.06638701260089874, "kl": 0.00030416845402214676, "learning_rate": 5.750798722044729e-07, "loss": 0.0329, "reward": 0.26510417461395264, "reward_std": 0.2857085719704628, "rewards/accuracy_reward": 0.11041666995733976, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.1526041716337204, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 757.7416931152344, "epoch": 0.003200512081933109, "grad_norm": 0.07611778378486633, "kl": 0.00030973673274274914, "learning_rate": 6.389776357827476e-07, "loss": 0.0533, "reward": 0.29062501043081285, "reward_std": 0.32584609389305114, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.22187500894069673, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 805.7187683105469, "epoch": 0.0035205632901264203, "grad_norm": 0.059567637741565704, "kl": 0.0002851913624908775, "learning_rate": 7.028753993610224e-07, "loss": 0.0066, "reward": 0.2348958395421505, "reward_std": 0.28508761525154114, "rewards/accuracy_reward": 0.0916666692122817, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.1390625037252903, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 789.0291809082031, "epoch": 0.003840614498319731, "grad_norm": 0.06451418995857239, "kl": 0.00028673450433416294, "learning_rate": 7.667731629392971e-07, "loss": 0.0169, "reward": 0.2432291738688946, "reward_std": 0.2678666725754738, "rewards/accuracy_reward": 0.07083333451300859, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.17031250447034835, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 784.3416809082031, "epoch": 0.004160665706513042, "grad_norm": 0.06225666403770447, "kl": 0.00030059528799029065, "learning_rate": 8.306709265175719e-07, "loss": 0.0149, "reward": 0.22760416939854622, "reward_std": 0.27756396904587743, "rewards/accuracy_reward": 0.06041666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.16718750447034836, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 726.8312622070313, "epoch": 0.004480716914706353, "grad_norm": 0.0688437670469284, "kl": 0.00033944830065593126, "learning_rate": 8.945686900958467e-07, "loss": 0.0411, "reward": 0.2854166775941849, "reward_std": 0.28965494930744173, "rewards/accuracy_reward": 0.11250000428408384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1729166716337204, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 812.6875183105469, "epoch": 0.004800768122899664, "grad_norm": 0.06382700055837631, "kl": 0.0003171889838995412, "learning_rate": 9.584664536741215e-07, "loss": 0.0315, "reward": 0.21979167312383652, "reward_std": 0.251245941221714, "rewards/accuracy_reward": 0.045833333395421505, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.17187500596046448, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 789.4062683105469, "epoch": 0.005120819331092975, "grad_norm": 0.06542661041021347, "kl": 0.00029948877927381544, "learning_rate": 1.0223642172523962e-06, "loss": 0.0325, "reward": 0.31822917610406876, "reward_std": 0.29038531333208084, "rewards/accuracy_reward": 0.12708333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.19114584103226662, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 774.4479431152344, "epoch": 0.005440870539286286, "grad_norm": 0.0694902092218399, "kl": 0.0003498132777167484, "learning_rate": 1.086261980830671e-06, "loss": 0.0363, "reward": 0.21302083879709244, "reward_std": 0.27850082218647004, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2046875089406967, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 760.539599609375, "epoch": 0.005760921747479597, "grad_norm": 0.06334321200847626, "kl": 0.0003469670336926356, "learning_rate": 1.1501597444089457e-06, "loss": 0.0129, "reward": 0.27968751043081286, "reward_std": 0.3003611326217651, "rewards/accuracy_reward": 0.10000000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.17968750596046448, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 796.0041809082031, "epoch": 0.006080972955672908, "grad_norm": 0.06805295497179031, "kl": 0.0004138117627007887, "learning_rate": 1.2140575079872206e-06, "loss": 0.0203, "reward": 0.2723958395421505, "reward_std": 0.3050135537981987, "rewards/accuracy_reward": 0.0916666690260172, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1807291738688946, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 828.1062622070312, "epoch": 0.006401024163866218, "grad_norm": 0.06387760490179062, "kl": 0.00039581527817063035, "learning_rate": 1.2779552715654952e-06, "loss": 0.0482, "reward": 0.2500000037252903, "reward_std": 0.2750077828764915, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.17500000745058059, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 809.339599609375, "epoch": 0.00672107537205953, "grad_norm": 0.0663813054561615, "kl": 0.0004165978491073474, "learning_rate": 1.34185303514377e-06, "loss": 0.0227, "reward": 0.2078125037252903, "reward_std": 0.28486852943897245, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.15572917088866234, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 784.6770935058594, "epoch": 0.0070411265802528405, "grad_norm": 0.06903195381164551, "kl": 0.00047817713639233264, "learning_rate": 1.4057507987220447e-06, "loss": 0.0204, "reward": 0.3536458477377892, "reward_std": 0.3153432786464691, "rewards/accuracy_reward": 0.11041667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2432291775941849, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 764.2896057128906, "epoch": 0.007361177788446151, "grad_norm": 0.06699788570404053, "kl": 0.0005332382366759703, "learning_rate": 1.4696485623003196e-06, "loss": 0.0319, "reward": 0.25468750968575476, "reward_std": 0.2953441575169563, "rewards/accuracy_reward": 0.06875000186264515, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.18385417237877846, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 773.4646057128906, "epoch": 0.007681228996639462, "grad_norm": 0.07254047691822052, "kl": 0.0006663853419013321, "learning_rate": 1.5335463258785943e-06, "loss": 0.0288, "reward": 0.2437500074505806, "reward_std": 0.32061070799827573, "rewards/accuracy_reward": 0.022916666977107526, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.21666667535901069, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 804.4416870117187, "epoch": 0.008001280204832774, "grad_norm": 0.07208665460348129, "kl": 0.0009051127126440406, "learning_rate": 1.5974440894568691e-06, "loss": 0.0228, "reward": 0.3687500134110451, "reward_std": 0.3236115902662277, "rewards/accuracy_reward": 0.11458333767950535, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2541666731238365, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 797.6062744140625, "epoch": 0.008321331413026083, "grad_norm": 0.07198836654424667, "kl": 0.0010994194832164793, "learning_rate": 1.6613418530351438e-06, "loss": 0.0327, "reward": 0.39322917759418485, "reward_std": 0.33683302253484726, "rewards/accuracy_reward": 0.15000000447034836, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.2369791693985462, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 741.5521057128906, "epoch": 0.008641382621219395, "grad_norm": 0.06907658278942108, "kl": 0.001576024480164051, "learning_rate": 1.7252396166134187e-06, "loss": 0.0406, "reward": 0.29739583730697633, "reward_std": 0.32967462539672854, "rewards/accuracy_reward": 0.0312500013038516, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.2619791716337204, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 727.1312683105468, "epoch": 0.008961433829412706, "grad_norm": 0.07349957525730133, "kl": 0.0019106465857475995, "learning_rate": 1.7891373801916933e-06, "loss": 0.0432, "reward": 0.42604167461395265, "reward_std": 0.36733110845088957, "rewards/accuracy_reward": 0.1291666690260172, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.29062501043081285, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 732.5708557128906, "epoch": 0.009281485037606016, "grad_norm": 0.07309871166944504, "kl": 0.0023886744515039028, "learning_rate": 1.8530351437699682e-06, "loss": 0.021, "reward": 0.313541679084301, "reward_std": 0.31117996871471404, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.2697916775941849, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 779.2083557128906, "epoch": 0.009601536245799328, "grad_norm": 0.06522124260663986, "kl": 0.002394045365508646, "learning_rate": 1.916932907348243e-06, "loss": 0.0413, "reward": 0.39635417610406876, "reward_std": 0.3403144717216492, "rewards/accuracy_reward": 0.09375000149011611, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.30260417312383653, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 750.5708557128906, "epoch": 0.00992158745399264, "grad_norm": 0.07314804196357727, "kl": 0.0028994579799473284, "learning_rate": 1.9808306709265175e-06, "loss": 0.059, "reward": 0.37916667461395265, "reward_std": 0.3659482032060623, "rewards/accuracy_reward": 0.04583333544433117, "rewards/format_reward": 0.00833333358168602, "rewards/tag_count_reward": 0.3250000089406967, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 739.9396179199218, "epoch": 0.01024163866218595, "grad_norm": 0.06802285462617874, "kl": 0.0030266973888501527, "learning_rate": 2.0447284345047924e-06, "loss": 0.0349, "reward": 0.42083333879709245, "reward_std": 0.33237463533878325, "rewards/accuracy_reward": 0.04583333358168602, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.36875001043081285, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 776.1646118164062, "epoch": 0.01056168987037926, "grad_norm": 0.061457838863134384, "kl": 0.0036948778200894595, "learning_rate": 2.1086261980830672e-06, "loss": 0.0446, "reward": 0.4390625059604645, "reward_std": 0.3595381796360016, "rewards/accuracy_reward": 0.06458333507180214, "rewards/format_reward": 0.00833333358168602, "rewards/tag_count_reward": 0.36614584624767305, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 796.0562683105469, "epoch": 0.010881741078572572, "grad_norm": 0.06410246342420578, "kl": 0.007469672057777643, "learning_rate": 2.172523961661342e-06, "loss": 0.0565, "reward": 0.43125001043081285, "reward_std": 0.3425430357456207, "rewards/accuracy_reward": 0.05208333544433117, "rewards/format_reward": 0.010416666977107525, "rewards/tag_count_reward": 0.3687500059604645, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 743.7812744140625, "epoch": 0.011201792286765882, "grad_norm": 0.06519825756549835, "kl": 0.007782880403101444, "learning_rate": 2.2364217252396165e-06, "loss": 0.0653, "reward": 0.45937501490116117, "reward_std": 0.36458621323108675, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.016666666977107523, "rewards/tag_count_reward": 0.37812501192092896, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 734.9021179199219, "epoch": 0.011521843494959194, "grad_norm": 0.07106705009937286, "kl": 0.010493304487317801, "learning_rate": 2.3003194888178914e-06, "loss": 0.0507, "reward": 0.4416666775941849, "reward_std": 0.361805260181427, "rewards/accuracy_reward": 0.01458333358168602, "rewards/format_reward": 0.014583333767950535, "rewards/tag_count_reward": 0.4125000059604645, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 745.5979370117187, "epoch": 0.011841894703152504, "grad_norm": 0.06895628571510315, "kl": 0.01310005160048604, "learning_rate": 2.3642172523961663e-06, "loss": 0.0644, "reward": 0.5255208402872086, "reward_std": 0.3466539680957794, "rewards/accuracy_reward": 0.0687500013038516, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.45260417759418486, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 740.8541809082031, "epoch": 0.012161945911345815, "grad_norm": 0.06350603699684143, "kl": 0.01148996208794415, "learning_rate": 2.428115015974441e-06, "loss": 0.0569, "reward": 0.6489583432674408, "reward_std": 0.3286067843437195, "rewards/accuracy_reward": 0.11875000521540642, "rewards/format_reward": 0.018750000558793545, "rewards/tag_count_reward": 0.5114583402872086, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 753.3229309082031, "epoch": 0.012481997119539127, "grad_norm": 0.06634090095758438, "kl": 0.010962517792358994, "learning_rate": 2.4920127795527156e-06, "loss": 0.0411, "reward": 0.567187511920929, "reward_std": 0.32436338663101194, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.02083333395421505, "rewards/tag_count_reward": 0.5067708522081376, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 740.2521118164062, "epoch": 0.012802048327732437, "grad_norm": 0.060020193457603455, "kl": 0.012796282302588224, "learning_rate": 2.5559105431309904e-06, "loss": 0.0726, "reward": 0.5812500149011612, "reward_std": 0.3484797939658165, "rewards/accuracy_reward": 0.050000001676380634, "rewards/format_reward": 0.014583333767950535, "rewards/tag_count_reward": 0.5166666805744171, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 697.7000183105469, "epoch": 0.013122099535925748, "grad_norm": 0.05944250524044037, "kl": 0.012154347030445933, "learning_rate": 2.6198083067092657e-06, "loss": 0.1055, "reward": 0.6166666924953461, "reward_std": 0.31222147643566134, "rewards/accuracy_reward": 0.0645833346992731, "rewards/format_reward": 0.012500000186264515, "rewards/tag_count_reward": 0.5395833522081375, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 735.7458618164062, "epoch": 0.01344215074411906, "grad_norm": 0.06103678047657013, "kl": 0.014400722924619913, "learning_rate": 2.68370607028754e-06, "loss": 0.058, "reward": 0.6255208432674408, "reward_std": 0.3416227579116821, "rewards/accuracy_reward": 0.07083333507180214, "rewards/format_reward": 0.01250000037252903, "rewards/tag_count_reward": 0.542187511920929, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 775.9333618164062, "epoch": 0.01376220195231237, "grad_norm": 0.06868734210729599, "kl": 0.017303874902427196, "learning_rate": 2.747603833865815e-06, "loss": 0.0529, "reward": 0.6255208551883698, "reward_std": 0.32499729096889496, "rewards/accuracy_reward": 0.10625000204890966, "rewards/format_reward": 0.01875000037252903, "rewards/tag_count_reward": 0.5005208492279053, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 698.9979309082031, "epoch": 0.014082253160505681, "grad_norm": 0.06461925804615021, "kl": 0.014360193721950054, "learning_rate": 2.8115015974440895e-06, "loss": 0.0818, "reward": 0.610416692495346, "reward_std": 0.3274090111255646, "rewards/accuracy_reward": 0.05208333507180214, "rewards/format_reward": 0.01666666716337204, "rewards/tag_count_reward": 0.5416666805744171, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 738.0083618164062, "epoch": 0.014402304368698993, "grad_norm": 0.06297613680362701, "kl": 0.01128577790223062, "learning_rate": 2.8753993610223648e-06, "loss": 0.0643, "reward": 0.5531250178813935, "reward_std": 0.30965033173561096, "rewards/accuracy_reward": 0.0229166679084301, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.5239583522081375, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 787.9354431152344, "epoch": 0.014722355576892302, "grad_norm": 0.0712624341249466, "kl": 0.019073341879993676, "learning_rate": 2.9392971246006392e-06, "loss": 0.0794, "reward": 0.5593750149011611, "reward_std": 0.28996885418891905, "rewards/accuracy_reward": 0.04375000111758709, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.5093750178813934, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 749.1521057128906, "epoch": 0.015042406785085614, "grad_norm": 0.057314738631248474, "kl": 0.01389997247606516, "learning_rate": 3.003194888178914e-06, "loss": 0.0873, "reward": 0.6125000178813934, "reward_std": 0.3025829717516899, "rewards/accuracy_reward": 0.0875000024214387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5250000178813934, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 765.7541809082031, "epoch": 0.015362457993278924, "grad_norm": 0.05906614288687706, "kl": 0.01638176813721657, "learning_rate": 3.0670926517571885e-06, "loss": 0.0634, "reward": 0.6109375119209289, "reward_std": 0.2791178122162819, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.5588541865348816, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 740.683349609375, "epoch": 0.015682509201472235, "grad_norm": 0.06702172756195068, "kl": 0.017427592631429435, "learning_rate": 3.130990415335464e-06, "loss": 0.0844, "reward": 0.6395833522081376, "reward_std": 0.28103085309267045, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.5604166835546494, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 724.6375244140625, "epoch": 0.016002560409665547, "grad_norm": 0.07204456627368927, "kl": 0.02422009501606226, "learning_rate": 3.1948881789137383e-06, "loss": 0.0571, "reward": 0.6385416924953461, "reward_std": 0.3018873170018196, "rewards/accuracy_reward": 0.09166667088866234, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.5447916835546494, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 745.5000305175781, "epoch": 0.01632261161785886, "grad_norm": 0.06197218969464302, "kl": 0.015600860584527254, "learning_rate": 3.258785942492013e-06, "loss": 0.0704, "reward": 0.6192708551883698, "reward_std": 0.27808820456266403, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.5776041865348815, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 740.6771057128906, "epoch": 0.016642662826052167, "grad_norm": 0.05719318985939026, "kl": 0.016424881853163244, "learning_rate": 3.3226837060702876e-06, "loss": 0.0677, "reward": 0.6161458492279053, "reward_std": 0.2974912986159325, "rewards/accuracy_reward": 0.02083333358168602, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.5932291805744171, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 713.7646118164063, "epoch": 0.016962714034245478, "grad_norm": 0.07055334746837616, "kl": 0.019731516763567925, "learning_rate": 3.386581469648563e-06, "loss": 0.1139, "reward": 0.6614583492279053, "reward_std": 0.26685925424098966, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.6239583492279053, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 682.839599609375, "epoch": 0.01728276524243879, "grad_norm": 0.06527426093816757, "kl": 0.01968300249427557, "learning_rate": 3.4504792332268373e-06, "loss": 0.0842, "reward": 0.6651041865348816, "reward_std": 0.27405439764261247, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.6171875119209289, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 706.3312622070313, "epoch": 0.0176028164506321, "grad_norm": 0.06510436534881592, "kl": 0.019583940878510474, "learning_rate": 3.514376996805112e-06, "loss": 0.091, "reward": 0.6786458641290665, "reward_std": 0.2530492454767227, "rewards/accuracy_reward": 0.07708333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625268220901, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 705.5250244140625, "epoch": 0.017922867658825413, "grad_norm": 0.07424038648605347, "kl": 0.02574802339076996, "learning_rate": 3.5782747603833866e-06, "loss": 0.0836, "reward": 0.7000000178813934, "reward_std": 0.2880414813756943, "rewards/accuracy_reward": 0.06875000111758708, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.631250011920929, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 626.9562622070313, "epoch": 0.018242918867018725, "grad_norm": 0.07916589826345444, "kl": 0.022329603042453527, "learning_rate": 3.642172523961662e-06, "loss": 0.0914, "reward": 0.7244791865348816, "reward_std": 0.24143780022859573, "rewards/accuracy_reward": 0.04583333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6786458551883697, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 663.2021118164063, "epoch": 0.018562970075212033, "grad_norm": 0.0976390466094017, "kl": 0.025962639041244982, "learning_rate": 3.7060702875399364e-06, "loss": 0.1237, "reward": 0.7281250178813934, "reward_std": 0.2525856912136078, "rewards/accuracy_reward": 0.03958333358168602, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.6864583551883697, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 700.6666931152344, "epoch": 0.018883021283405344, "grad_norm": 0.06289377063512802, "kl": 0.0208747168071568, "learning_rate": 3.7699680511182112e-06, "loss": 0.0846, "reward": 0.6458333492279053, "reward_std": 0.2825623080134392, "rewards/accuracy_reward": 0.016666666977107523, "rewards/format_reward": 0.00833333358168602, "rewards/tag_count_reward": 0.6208333432674408, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 665.733349609375, "epoch": 0.019203072491598656, "grad_norm": 0.07116346806287766, "kl": 0.023737166076898575, "learning_rate": 3.833865814696486e-06, "loss": 0.0857, "reward": 0.7718750238418579, "reward_std": 0.26197345554828644, "rewards/accuracy_reward": 0.12083333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6510416865348816, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 697.3125244140625, "epoch": 0.019523123699791967, "grad_norm": 0.061922844499349594, "kl": 0.023615499306470156, "learning_rate": 3.8977635782747605e-06, "loss": 0.0684, "reward": 0.6494791865348816, "reward_std": 0.25366342514753343, "rewards/accuracy_reward": 0.01458333358168602, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.6307291805744171, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 706.2312683105469, "epoch": 0.01984317490798528, "grad_norm": 0.06591752171516418, "kl": 0.019074952974915505, "learning_rate": 3.961661341853035e-06, "loss": 0.0935, "reward": 0.6859375298023224, "reward_std": 0.2340133711695671, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.6442708551883698, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 691.6312683105468, "epoch": 0.020163226116178587, "grad_norm": 0.06416673958301544, "kl": 0.024632269330322743, "learning_rate": 4.02555910543131e-06, "loss": 0.1397, "reward": 0.7000000238418579, "reward_std": 0.251662477850914, "rewards/accuracy_reward": 0.0541666679084301, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.6437500238418579, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 689.0104370117188, "epoch": 0.0204832773243719, "grad_norm": 0.06594003736972809, "kl": 0.023258844204247, "learning_rate": 4.089456869009585e-06, "loss": 0.1066, "reward": 0.7401041805744171, "reward_std": 0.25073023736476896, "rewards/accuracy_reward": 0.08333333358168601, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6567708492279053, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 698.4687683105469, "epoch": 0.02080332853256521, "grad_norm": 0.06369701772928238, "kl": 0.02361576007679105, "learning_rate": 4.15335463258786e-06, "loss": 0.1014, "reward": 0.6906250238418579, "reward_std": 0.23723849654197693, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.6489583551883698, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 631.3666870117188, "epoch": 0.02112337974075852, "grad_norm": 0.0697370246052742, "kl": 0.027477294206619263, "learning_rate": 4.2172523961661345e-06, "loss": 0.0981, "reward": 0.7203125119209289, "reward_std": 0.27272156327962876, "rewards/accuracy_reward": 0.03333333451300859, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.6828125178813934, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 657.4875183105469, "epoch": 0.021443430948951833, "grad_norm": 0.07227819412946701, "kl": 0.022128170542418956, "learning_rate": 4.28115015974441e-06, "loss": 0.1108, "reward": 0.7484375238418579, "reward_std": 0.2265900582075119, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6713541865348815, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 605.1854309082031, "epoch": 0.021763482157145145, "grad_norm": 0.08304242789745331, "kl": 0.036028834991157055, "learning_rate": 4.345047923322684e-06, "loss": 0.1116, "reward": 0.8598958671092987, "reward_std": 0.24308189302682875, "rewards/accuracy_reward": 0.14791667200624942, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7119791865348816, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 631.5979431152343, "epoch": 0.022083533365338453, "grad_norm": 0.07294842600822449, "kl": 0.028276703879237176, "learning_rate": 4.408945686900959e-06, "loss": 0.0673, "reward": 0.720312523841858, "reward_std": 0.2192530706524849, "rewards/accuracy_reward": 0.012500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7078125238418579, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 672.1000061035156, "epoch": 0.022403584573531764, "grad_norm": 0.07134870439767838, "kl": 0.025765881687402726, "learning_rate": 4.472843450479233e-06, "loss": 0.0862, "reward": 0.7296875178813934, "reward_std": 0.24120083600282669, "rewards/accuracy_reward": 0.02500000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7046875178813934, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 638.2479309082031, "epoch": 0.022723635781725076, "grad_norm": 0.07261071354150772, "kl": 0.036591825634241106, "learning_rate": 4.536741214057508e-06, "loss": 0.0825, "reward": 0.8052083551883698, "reward_std": 0.22306446582078934, "rewards/accuracy_reward": 0.06250000149011611, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.7406250178813935, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 643.6604248046875, "epoch": 0.023043686989918388, "grad_norm": 0.07969695329666138, "kl": 0.03463496062904596, "learning_rate": 4.600638977635783e-06, "loss": 0.0973, "reward": 0.8437500298023224, "reward_std": 0.2542044401168823, "rewards/accuracy_reward": 0.10000000316649675, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.7395833492279053, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 653.0833557128906, "epoch": 0.0233637381981117, "grad_norm": 0.07199858874082565, "kl": 0.031290368735790254, "learning_rate": 4.664536741214058e-06, "loss": 0.0653, "reward": 0.8010416924953461, "reward_std": 0.21112514436244964, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7322916865348816, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 671.058349609375, "epoch": 0.023683789406305007, "grad_norm": 0.08724746108055115, "kl": 0.04281867854297161, "learning_rate": 4.7284345047923325e-06, "loss": 0.0785, "reward": 0.7541666924953461, "reward_std": 0.22042571306228637, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.7145833551883698, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 697.4229309082032, "epoch": 0.02400384061449832, "grad_norm": 0.07457519322633743, "kl": 0.03650112468749285, "learning_rate": 4.792332268370608e-06, "loss": 0.0906, "reward": 0.8213541805744171, "reward_std": 0.2625132277607918, "rewards/accuracy_reward": 0.12083333749324084, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7005208551883697, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 652.5500091552734, "epoch": 0.02432389182269163, "grad_norm": 0.07498233765363693, "kl": 0.03543906323611736, "learning_rate": 4.856230031948882e-06, "loss": 0.095, "reward": 0.7598958492279053, "reward_std": 0.22588661164045334, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7411458492279053, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 677.7000183105469, "epoch": 0.024643943030884942, "grad_norm": 0.09663916379213333, "kl": 0.04480956122279167, "learning_rate": 4.920127795527157e-06, "loss": 0.1344, "reward": 0.7166666865348816, "reward_std": 0.2699084341526031, "rewards/accuracy_reward": 0.01875000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.697916692495346, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 658.0666748046875, "epoch": 0.024963994239078253, "grad_norm": 0.08884267508983612, "kl": 0.06787048671394587, "learning_rate": 4.984025559105431e-06, "loss": 0.0796, "reward": 0.7500000238418579, "reward_std": 0.20937047749757767, "rewards/accuracy_reward": 0.00625, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.7375000178813934, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 653.7562683105468, "epoch": 0.025284045447271565, "grad_norm": 0.12078572064638138, "kl": 0.04476796705275774, "learning_rate": 5.0479233226837065e-06, "loss": 0.1238, "reward": 0.8010416865348816, "reward_std": 0.28335138112306596, "rewards/accuracy_reward": 0.06875000111758708, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.7302083551883698, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 663.2187744140625, "epoch": 0.025604096655464873, "grad_norm": 0.9814605712890625, "kl": 0.16789422370493412, "learning_rate": 5.111821086261981e-06, "loss": 0.1109, "reward": 0.8114583671092988, "reward_std": 0.26037254482507705, "rewards/accuracy_reward": 0.07708333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7343750238418579, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 645.8104309082031, "epoch": 0.025924147863658185, "grad_norm": 0.10365696996450424, "kl": 0.04504641108214855, "learning_rate": 5.175718849840255e-06, "loss": 0.1185, "reward": 0.7807291984558106, "reward_std": 0.2670013889670372, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7432291865348816, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 679.1979431152344, "epoch": 0.026244199071851496, "grad_norm": 0.10205037146806717, "kl": 0.04317870959639549, "learning_rate": 5.2396166134185315e-06, "loss": 0.124, "reward": 0.7625000178813934, "reward_std": 0.26102137863636016, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.7208333551883698, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 663.708349609375, "epoch": 0.026564250280044808, "grad_norm": 0.09976530820131302, "kl": 0.04077131990343332, "learning_rate": 5.303514376996806e-06, "loss": 0.1011, "reward": 0.8312500178813934, "reward_std": 0.2625503420829773, "rewards/accuracy_reward": 0.08333333544433116, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7479166865348816, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 633.995849609375, "epoch": 0.02688430148823812, "grad_norm": 0.11126335710287094, "kl": 0.04656725451350212, "learning_rate": 5.36741214057508e-06, "loss": 0.1043, "reward": 0.7817708551883698, "reward_std": 0.2660654917359352, "rewards/accuracy_reward": 0.06041666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7213541865348816, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 640.8979370117188, "epoch": 0.027204352696431428, "grad_norm": 0.1051332950592041, "kl": 0.04987532235682011, "learning_rate": 5.431309904153355e-06, "loss": 0.127, "reward": 0.7156250238418579, "reward_std": 0.253768752515316, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.7093750238418579, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 649.9750183105468, "epoch": 0.02752440390462474, "grad_norm": 0.1315613090991974, "kl": 0.049108054488897324, "learning_rate": 5.49520766773163e-06, "loss": 0.1478, "reward": 0.6697916865348816, "reward_std": 0.3035299152135849, "rewards/accuracy_reward": 0.014583333767950535, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6552083551883697, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 686.2729309082031, "epoch": 0.02784445511281805, "grad_norm": 0.15391191840171814, "kl": 0.055657780915498736, "learning_rate": 5.5591054313099045e-06, "loss": 0.1489, "reward": 0.6500000208616257, "reward_std": 0.29610070735216143, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6166666835546494, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 656.4562683105469, "epoch": 0.028164506321011362, "grad_norm": 0.11603273451328278, "kl": 0.04724425338208675, "learning_rate": 5.623003194888179e-06, "loss": 0.1022, "reward": 0.6697916924953461, "reward_std": 0.2515522539615631, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6489583492279053, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 696.1625183105468, "epoch": 0.028484557529204674, "grad_norm": 0.15315236151218414, "kl": 0.04981156475841999, "learning_rate": 5.6869009584664534e-06, "loss": 0.0933, "reward": 0.665104192495346, "reward_std": 0.27600702494382856, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6192708611488342, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 695.0104431152344, "epoch": 0.028804608737397985, "grad_norm": 0.14363472163677216, "kl": 0.048564912378787996, "learning_rate": 5.7507987220447296e-06, "loss": 0.0629, "reward": 0.6744791805744171, "reward_std": 0.2537939205765724, "rewards/accuracy_reward": 0.039583333395421506, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6348958432674408, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 675.5833557128906, "epoch": 0.029124659945591293, "grad_norm": 0.24816231429576874, "kl": 0.07004429288208484, "learning_rate": 5.814696485623004e-06, "loss": 0.0738, "reward": 0.6838541865348816, "reward_std": 0.2542484775185585, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6401041924953461, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 707.4437683105468, "epoch": 0.029444711153784605, "grad_norm": 0.35220104455947876, "kl": 0.08218934014439583, "learning_rate": 5.8785942492012785e-06, "loss": 0.0353, "reward": 0.6536458551883697, "reward_std": 0.2631165474653244, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.6140625178813934, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 640.3979370117188, "epoch": 0.029764762361977917, "grad_norm": 0.48827409744262695, "kl": 0.1260451439768076, "learning_rate": 5.942492012779553e-06, "loss": 0.0827, "reward": 0.658854192495346, "reward_std": 0.26948108375072477, "rewards/accuracy_reward": 0.022916667349636554, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6359375238418579, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 688.8896057128907, "epoch": 0.030084813570171228, "grad_norm": 0.6792539358139038, "kl": 0.1645615816116333, "learning_rate": 6.006389776357828e-06, "loss": 0.0582, "reward": 0.7057291865348816, "reward_std": 0.27467391192913054, "rewards/accuracy_reward": 0.08125000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6244791924953461, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 699.545849609375, "epoch": 0.03040486477836454, "grad_norm": 0.7102315425872803, "kl": 0.24729929864406586, "learning_rate": 6.070287539936103e-06, "loss": -0.0393, "reward": 0.6776041984558105, "reward_std": 0.2612756446003914, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6046875298023224, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 649.2875183105468, "epoch": 0.030724915986557848, "grad_norm": 1.246417760848999, "kl": 0.40844622552394866, "learning_rate": 6.134185303514377e-06, "loss": -0.068, "reward": 0.657812523841858, "reward_std": 0.2727681741118431, "rewards/accuracy_reward": 0.04791666772216559, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.6078125238418579, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 605.0271118164062, "epoch": 0.03104496719475116, "grad_norm": 3.4409334659576416, "kl": 1.8638823270797729, "learning_rate": 6.1980830670926515e-06, "loss": -0.1356, "reward": 0.6661458492279053, "reward_std": 0.28613357841968534, "rewards/accuracy_reward": 0.07500000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5911458492279053, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 651.3687622070313, "epoch": 0.03136501840294447, "grad_norm": 6.783247470855713, "kl": 0.931004011631012, "learning_rate": 6.261980830670928e-06, "loss": -0.19, "reward": 0.6791666865348815, "reward_std": 0.30113149881362916, "rewards/accuracy_reward": 0.0937500026077032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5854166805744171, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 565.1354309082031, "epoch": 0.03168506961113778, "grad_norm": 0.5530074238777161, "kl": 0.5937871515750885, "learning_rate": 6.325878594249202e-06, "loss": -0.2302, "reward": 0.5416666805744171, "reward_std": 0.3111159473657608, "rewards/accuracy_reward": 0.016666667349636555, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5250000119209289, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 615.7812622070312, "epoch": 0.032005120819331094, "grad_norm": 0.4718743860721588, "kl": 0.8061838716268539, "learning_rate": 6.3897763578274765e-06, "loss": -0.1769, "reward": 0.5369791775941849, "reward_std": 0.281660270690918, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49322917461395266, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 584.3187622070312, "epoch": 0.032325172027524406, "grad_norm": 1.6203663349151611, "kl": 1.0429674439132213, "learning_rate": 6.453674121405751e-06, "loss": -0.2055, "reward": 0.5328125149011612, "reward_std": 0.3219150841236115, "rewards/accuracy_reward": 0.09375000242143869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4390625149011612, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 498.24376525878904, "epoch": 0.03264522323571772, "grad_norm": 15.725761413574219, "kl": 1.3648170441389085, "learning_rate": 6.517571884984026e-06, "loss": -0.264, "reward": 0.387500011920929, "reward_std": 0.2903954029083252, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3458333447575569, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 440.0270965576172, "epoch": 0.03296527444391103, "grad_norm": 23.40546989440918, "kl": 7.020788234472275, "learning_rate": 6.581469648562301e-06, "loss": -0.1625, "reward": 0.2968750104308128, "reward_std": 0.26096881479024886, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2927083417773247, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 376.03334350585936, "epoch": 0.03328532565210433, "grad_norm": 7.267569541931152, "kl": 2.4132164478302003, "learning_rate": 6.645367412140575e-06, "loss": -0.5439, "reward": 0.24791667461395264, "reward_std": 0.21916062086820604, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24791667461395264, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 371.7791778564453, "epoch": 0.033605376860297645, "grad_norm": 2.026956558227539, "kl": 5.643379735946655, "learning_rate": 6.709265175718851e-06, "loss": -0.4394, "reward": 0.31406250447034834, "reward_std": 0.22878252267837523, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.24739584028720857, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 361.2979278564453, "epoch": 0.033925428068490956, "grad_norm": 3.8936071395874023, "kl": 2.7883768916130065, "learning_rate": 6.773162939297126e-06, "loss": -0.7208, "reward": 0.36197917312383654, "reward_std": 0.2445184901356697, "rewards/accuracy_reward": 0.10416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2578125074505806, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 424.5145965576172, "epoch": 0.03424547927668427, "grad_norm": 1.3432917594909668, "kl": 2.7319489240646364, "learning_rate": 6.8370607028754e-06, "loss": -0.5744, "reward": 0.3395833432674408, "reward_std": 0.23924150168895722, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.30625001192092893, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 314.5729248046875, "epoch": 0.03456553048487758, "grad_norm": 17.39250373840332, "kl": 10.756308102607727, "learning_rate": 6.900958466453675e-06, "loss": -0.7653, "reward": 0.2385416731238365, "reward_std": 0.23416123688220977, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2385416731238365, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 363.77084197998045, "epoch": 0.03488558169307089, "grad_norm": 11.659235954284668, "kl": 7.152430748939514, "learning_rate": 6.96485623003195e-06, "loss": -0.5854, "reward": 0.3473958447575569, "reward_std": 0.24123955219984056, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.28072917461395264, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 345.0770950317383, "epoch": 0.0352056329012642, "grad_norm": 1.1932032108306885, "kl": 2.870084857940674, "learning_rate": 7.028753993610224e-06, "loss": -0.825, "reward": 0.40833334177732467, "reward_std": 0.23637133538722993, "rewards/accuracy_reward": 0.13333333730697633, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.2750000074505806, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 368.9896011352539, "epoch": 0.035525684109457514, "grad_norm": 0.5973522067070007, "kl": 2.3504113078117372, "learning_rate": 7.092651757188499e-06, "loss": -0.7701, "reward": 0.3484375089406967, "reward_std": 0.2674763187766075, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.31093751043081286, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 398.56459350585936, "epoch": 0.035845735317650826, "grad_norm": 0.35262539982795715, "kl": 2.6340174436569215, "learning_rate": 7.156549520766773e-06, "loss": -0.5525, "reward": 0.3354166761040688, "reward_std": 0.2599399000406265, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.29791667610406875, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 406.2562591552734, "epoch": 0.03616578652584414, "grad_norm": 0.3773941397666931, "kl": 2.847642481327057, "learning_rate": 7.220447284345049e-06, "loss": -0.6454, "reward": 0.3380208432674408, "reward_std": 0.2487858936190605, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3338541775941849, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 468.59376525878906, "epoch": 0.03648583773403745, "grad_norm": 1.5241540670394897, "kl": 3.3227088809013368, "learning_rate": 7.284345047923324e-06, "loss": -0.4856, "reward": 0.34270834028720853, "reward_std": 0.23925637304782868, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.34270834028720853, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 485.65834655761716, "epoch": 0.036805888942230754, "grad_norm": 0.6140667200088501, "kl": 2.535499429702759, "learning_rate": 7.348242811501598e-06, "loss": -0.5934, "reward": 0.3692708432674408, "reward_std": 0.23712805062532424, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3338541775941849, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 531.3479309082031, "epoch": 0.037125940150424065, "grad_norm": 0.2592978775501251, "kl": 1.8334528475999832, "learning_rate": 7.412140575079873e-06, "loss": -0.4457, "reward": 0.41614584922790526, "reward_std": 0.23759952187538147, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.37864584624767306, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 522.435433959961, "epoch": 0.03744599135861738, "grad_norm": 0.2672344148159027, "kl": 2.5485853970050814, "learning_rate": 7.476038338658148e-06, "loss": -0.4729, "reward": 0.4140625178813934, "reward_std": 0.23257143348455428, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.33906251192092896, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 564.4583435058594, "epoch": 0.03776604256681069, "grad_norm": 0.169293612241745, "kl": 2.141266053915024, "learning_rate": 7.5399361022364225e-06, "loss": -0.4337, "reward": 0.4895833536982536, "reward_std": 0.20847297906875611, "rewards/accuracy_reward": 0.13333333730697633, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3562500074505806, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 598.439599609375, "epoch": 0.038086093775004, "grad_norm": 0.16004759073257446, "kl": 2.440905587375164, "learning_rate": 7.603833865814697e-06, "loss": -0.3232, "reward": 0.49895834624767305, "reward_std": 0.2105870932340622, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3947916805744171, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 587.9896118164063, "epoch": 0.03840614498319731, "grad_norm": 0.121612548828125, "kl": 2.967607820034027, "learning_rate": 7.667731629392972e-06, "loss": -0.4322, "reward": 0.38072917610406876, "reward_std": 0.20375476628541947, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3473958417773247, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 525.1312728881836, "epoch": 0.03872619619139062, "grad_norm": 0.45568010210990906, "kl": 3.2019447505474092, "learning_rate": 7.731629392971247e-06, "loss": -0.4594, "reward": 0.43593751788139345, "reward_std": 0.21259481608867645, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3630208447575569, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 606.1395965576172, "epoch": 0.039046247399583935, "grad_norm": 0.22284314036369324, "kl": 2.2424231648445128, "learning_rate": 7.795527156549521e-06, "loss": -0.3353, "reward": 0.41145834028720857, "reward_std": 0.20110711306333542, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.37604167461395266, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 625.7979370117188, "epoch": 0.039366298607777246, "grad_norm": 0.12313688546419144, "kl": 2.1035622477531435, "learning_rate": 7.859424920127796e-06, "loss": -0.3469, "reward": 0.40520834624767305, "reward_std": 0.21936969012022017, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.38437501192092893, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 576.0708465576172, "epoch": 0.03968634981597056, "grad_norm": 1.0849019289016724, "kl": 3.4187208458781244, "learning_rate": 7.92332268370607e-06, "loss": -0.377, "reward": 0.426562511920929, "reward_std": 0.2063383214175701, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3598958432674408, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 539.7833465576172, "epoch": 0.04000640102416387, "grad_norm": 0.17697374522686005, "kl": 3.645887120813131, "learning_rate": 7.987220447284347e-06, "loss": -0.4098, "reward": 0.4916666835546494, "reward_std": 0.1886795900762081, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3916666775941849, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 520.3270965576172, "epoch": 0.040326452232357174, "grad_norm": 0.9790335297584534, "kl": 4.54524188041687, "learning_rate": 8.05111821086262e-06, "loss": -0.5171, "reward": 0.3625000089406967, "reward_std": 0.18890787661075592, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3270833432674408, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 515.5625122070312, "epoch": 0.040646503440550485, "grad_norm": 0.2020310014486313, "kl": 3.900516414642334, "learning_rate": 8.115015974440896e-06, "loss": -0.6225, "reward": 0.31822917312383653, "reward_std": 0.20110346525907516, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3161458417773247, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 540.4583557128906, "epoch": 0.0409665546487438, "grad_norm": 0.6082282662391663, "kl": 3.7118215203285216, "learning_rate": 8.17891373801917e-06, "loss": -0.4666, "reward": 0.3500000089406967, "reward_std": 0.20825719237327575, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.34791667461395265, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 519.7604309082031, "epoch": 0.04128660585693711, "grad_norm": 0.1156620979309082, "kl": 2.5296462953090666, "learning_rate": 8.242811501597445e-06, "loss": -0.4838, "reward": 0.3765625089406967, "reward_std": 0.22583024352788925, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.33906251192092896, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 500.91876525878905, "epoch": 0.04160665706513042, "grad_norm": 0.1654772162437439, "kl": 1.6273529171943664, "learning_rate": 8.30670926517572e-06, "loss": -0.5471, "reward": 0.46093751192092897, "reward_std": 0.2393754631280899, "rewards/accuracy_reward": 0.13958333749324084, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.32135417461395266, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 523.1875122070312, "epoch": 0.04192670827332373, "grad_norm": 0.24710217118263245, "kl": 1.1954802095890045, "learning_rate": 8.370607028753994e-06, "loss": -0.4337, "reward": 0.35468751192092896, "reward_std": 0.21932000368833543, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3526041805744171, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 494.2604278564453, "epoch": 0.04224675948151704, "grad_norm": 0.3936014175415039, "kl": 1.0165327221155167, "learning_rate": 8.434504792332269e-06, "loss": -0.4978, "reward": 0.3369791775941849, "reward_std": 0.21726072281599046, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3369791775941849, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 482.7979278564453, "epoch": 0.042566810689710355, "grad_norm": 0.29438725113868713, "kl": 1.0482024848461151, "learning_rate": 8.498402555910544e-06, "loss": -0.4462, "reward": 0.4250000149011612, "reward_std": 0.21294644474983215, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.35416668057441714, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 516.9062683105469, "epoch": 0.042886861897903666, "grad_norm": 0.17954134941101074, "kl": 1.039529764652252, "learning_rate": 8.56230031948882e-06, "loss": -0.3937, "reward": 0.3729166775941849, "reward_std": 0.20056458413600922, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3729166775941849, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 537.2791809082031, "epoch": 0.04320691310609698, "grad_norm": 0.12642943859100342, "kl": 1.100398463010788, "learning_rate": 8.626198083067093e-06, "loss": -0.4328, "reward": 0.38802084922790525, "reward_std": 0.19484265446662902, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3546875149011612, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 545.1250091552735, "epoch": 0.04352696431429029, "grad_norm": 0.10088885575532913, "kl": 1.264848804473877, "learning_rate": 8.690095846645368e-06, "loss": -0.3422, "reward": 0.414583346247673, "reward_std": 0.18097187280654908, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3791666775941849, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 577.5021057128906, "epoch": 0.043847015522483594, "grad_norm": 0.14205513894557953, "kl": 1.0939792722463608, "learning_rate": 8.753993610223644e-06, "loss": -0.2693, "reward": 0.42968751192092897, "reward_std": 0.16178201138973236, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.39427084624767306, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 641.9916748046875, "epoch": 0.044167066730676906, "grad_norm": 0.0982375368475914, "kl": 0.7304096844047308, "learning_rate": 8.817891373801917e-06, "loss": -0.1667, "reward": 0.40989584624767306, "reward_std": 0.1424245983362198, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40989584624767306, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 617.9458435058593, "epoch": 0.04448711793887022, "grad_norm": 0.13668540120124817, "kl": 1.4703953325748444, "learning_rate": 8.881789137380193e-06, "loss": -0.1482, "reward": 0.4848958492279053, "reward_std": 0.1417014442384243, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41614584624767303, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 696.9750122070312, "epoch": 0.04480716914706353, "grad_norm": 1.111905574798584, "kl": 0.7676813244819641, "learning_rate": 8.945686900958466e-06, "loss": -0.0908, "reward": 0.4343750149011612, "reward_std": 0.1505623020231724, "rewards/accuracy_reward": 0.01041666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42395834624767303, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 630.5812744140625, "epoch": 0.04512722035525684, "grad_norm": 0.17764167487621307, "kl": 0.6737473249435425, "learning_rate": 9.009584664536743e-06, "loss": -0.1169, "reward": 0.42968750894069674, "reward_std": 0.1200267419219017, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42968750894069674, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 632.0166809082032, "epoch": 0.04544727156345015, "grad_norm": 0.0950830951333046, "kl": 1.0807377099990845, "learning_rate": 9.073482428115017e-06, "loss": -0.1856, "reward": 0.4817708432674408, "reward_std": 0.15390508249402046, "rewards/accuracy_reward": 0.07083333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41093750596046447, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 589.5645965576172, "epoch": 0.045767322771643464, "grad_norm": 0.1526706963777542, "kl": 0.7299367796629668, "learning_rate": 9.137380191693292e-06, "loss": -0.1688, "reward": 0.5208333522081375, "reward_std": 0.14540843814611434, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.418750011920929, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 627.7479370117187, "epoch": 0.046087373979836775, "grad_norm": 0.15692493319511414, "kl": 0.7879636850208044, "learning_rate": 9.201277955271566e-06, "loss": -0.1618, "reward": 0.4661458432674408, "reward_std": 0.12382942289113999, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43281251192092896, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 651.795849609375, "epoch": 0.04640742518803009, "grad_norm": 0.07808000594377518, "kl": 0.7620222073048353, "learning_rate": 9.265175718849841e-06, "loss": -0.1348, "reward": 0.5567708402872086, "reward_std": 0.11367295645177364, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45260417759418486, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 640.8125122070312, "epoch": 0.0467274763962234, "grad_norm": 0.07719507813453674, "kl": 0.6032818179577589, "learning_rate": 9.329073482428116e-06, "loss": -0.0882, "reward": 0.5203125149011611, "reward_std": 0.10005458071827888, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45364584028720856, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 649.1833465576171, "epoch": 0.04704752760441671, "grad_norm": 0.06736160814762115, "kl": 0.4908189844340086, "learning_rate": 9.39297124600639e-06, "loss": -0.0721, "reward": 0.5036458462476731, "reward_std": 0.1022965095937252, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4661458432674408, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 696.7562683105468, "epoch": 0.047367578812610014, "grad_norm": 0.06782618910074234, "kl": 0.44623993411660196, "learning_rate": 9.456869009584665e-06, "loss": -0.0432, "reward": 0.4822916746139526, "reward_std": 0.09638992436230183, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4802083432674408, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 636.6437683105469, "epoch": 0.047687630020803326, "grad_norm": 0.06565655022859573, "kl": 1.2311117429286242, "learning_rate": 9.52076677316294e-06, "loss": -0.0837, "reward": 0.6432291865348816, "reward_std": 0.10860510841012001, "rewards/accuracy_reward": 0.1708333384245634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47239584624767306, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 673.8854370117188, "epoch": 0.04800768122899664, "grad_norm": 0.05761784315109253, "kl": 0.3750141691416502, "learning_rate": 9.584664536741216e-06, "loss": -0.0318, "reward": 0.5354166865348816, "reward_std": 0.12210818231105805, "rewards/accuracy_reward": 0.05000000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4854166805744171, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 683.5250122070313, "epoch": 0.04832773243718995, "grad_norm": 0.06147807464003563, "kl": 0.48957694321870804, "learning_rate": 9.64856230031949e-06, "loss": -0.0461, "reward": 0.518750011920929, "reward_std": 0.09728633239865303, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4833333432674408, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 687.3541809082031, "epoch": 0.04864778364538326, "grad_norm": 0.05679427832365036, "kl": 0.7635953679680825, "learning_rate": 9.712460063897765e-06, "loss": -0.0577, "reward": 0.5260416865348816, "reward_std": 0.09843504205346107, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4927083492279053, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 709.8979431152344, "epoch": 0.04896783485357657, "grad_norm": 0.061386361718177795, "kl": 0.34264843370765447, "learning_rate": 9.77635782747604e-06, "loss": -0.0276, "reward": 0.5447916924953461, "reward_std": 0.12330903708934784, "rewards/accuracy_reward": 0.047916668094694616, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49687502086162566, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 653.8687683105469, "epoch": 0.049287886061769884, "grad_norm": 0.06698963791131973, "kl": 0.8141861855983734, "learning_rate": 9.840255591054313e-06, "loss": -0.06, "reward": 0.5182291865348816, "reward_std": 0.12519886679947376, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5140625238418579, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 620.10419921875, "epoch": 0.049607937269963195, "grad_norm": 0.0670495480298996, "kl": 1.0433136209845544, "learning_rate": 9.904153354632589e-06, "loss": -0.0964, "reward": 0.5984375208616257, "reward_std": 0.15226368308067323, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.525520846247673, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 684.5625244140625, "epoch": 0.04992798847815651, "grad_norm": 0.06608369201421738, "kl": 0.8835949804633856, "learning_rate": 9.968051118210862e-06, "loss": -0.0806, "reward": 0.6583333492279053, "reward_std": 0.17013006806373596, "rewards/accuracy_reward": 0.10833333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5500000238418579, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 745.2604431152344, "epoch": 0.05024803968634982, "grad_norm": 0.06933408230543137, "kl": 0.14976065829396248, "learning_rate": 1.0031948881789138e-05, "loss": -0.0559, "reward": 0.5828125238418579, "reward_std": 0.1708540216088295, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.580729192495346, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 728.7020935058594, "epoch": 0.05056809089454313, "grad_norm": 0.06534843891859055, "kl": 0.4937169037759304, "learning_rate": 1.0095846645367413e-05, "loss": -0.0882, "reward": 0.6229166924953461, "reward_std": 0.17530024647712708, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6229166924953461, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 734.3208557128906, "epoch": 0.050888142102736435, "grad_norm": 0.062214288860559464, "kl": 0.5076489731669426, "learning_rate": 1.0159744408945688e-05, "loss": -0.0606, "reward": 0.7739583492279053, "reward_std": 0.2034250505268574, "rewards/accuracy_reward": 0.11875000447034836, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6552083551883697, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 692.9541870117188, "epoch": 0.051208193310929746, "grad_norm": 0.0641920194029808, "kl": 0.3405150633305311, "learning_rate": 1.0223642172523962e-05, "loss": -0.0657, "reward": 0.7109375178813935, "reward_std": 0.16137611567974092, "rewards/accuracy_reward": 0.014583333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6963541865348816, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 705.8896057128907, "epoch": 0.05152824451912306, "grad_norm": 0.057186439633369446, "kl": 0.3750379033386707, "learning_rate": 1.0287539936102237e-05, "loss": -0.0887, "reward": 0.7437500178813934, "reward_std": 0.17544491738080978, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7000000298023223, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 787.7479309082031, "epoch": 0.05184829572731637, "grad_norm": 0.053737491369247437, "kl": 0.26711506862193346, "learning_rate": 1.035143769968051e-05, "loss": -0.044, "reward": 0.7645833492279053, "reward_std": 0.17004137337207795, "rewards/accuracy_reward": 0.03125000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7333333551883697, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 795.1354370117188, "epoch": 0.05216834693550968, "grad_norm": 0.04419364780187607, "kl": 0.17381047271192074, "learning_rate": 1.0415335463258786e-05, "loss": -0.0529, "reward": 0.7645833551883697, "reward_std": 0.14466918781399726, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7270833551883698, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 750.2250183105468, "epoch": 0.05248839814370299, "grad_norm": 0.04193587601184845, "kl": 0.20865581147372722, "learning_rate": 1.0479233226837063e-05, "loss": -0.0779, "reward": 0.7322916805744171, "reward_std": 0.15352466367185116, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7239583551883697, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 752.6562683105469, "epoch": 0.052808449351896304, "grad_norm": 0.04201805219054222, "kl": 0.08975614961236715, "learning_rate": 1.0543130990415335e-05, "loss": -0.0339, "reward": 0.9260416865348816, "reward_std": 0.12468962892889976, "rewards/accuracy_reward": 0.1937500059604645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7322916805744171, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 702.752099609375, "epoch": 0.053128500560089616, "grad_norm": 0.04461502656340599, "kl": 0.30946322418749334, "learning_rate": 1.0607028753993612e-05, "loss": -0.0498, "reward": 0.8140625178813934, "reward_std": 0.12668757885694504, "rewards/accuracy_reward": 0.07916666995733976, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7348958551883698, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 741.483349609375, "epoch": 0.05344855176828293, "grad_norm": 0.04805014282464981, "kl": 0.2729524029418826, "learning_rate": 1.0670926517571887e-05, "loss": -0.0861, "reward": 0.7713541984558105, "reward_std": 0.1388819508254528, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7317708551883697, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 750.4937622070313, "epoch": 0.05376860297647624, "grad_norm": 0.04891999065876007, "kl": 0.23906942158937455, "learning_rate": 1.073482428115016e-05, "loss": -0.0397, "reward": 0.8406250178813934, "reward_std": 0.1587209053337574, "rewards/accuracy_reward": 0.11666666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7239583432674408, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 747.183349609375, "epoch": 0.05408865418466955, "grad_norm": 0.061414770781993866, "kl": 0.12371877171099185, "learning_rate": 1.0798722044728436e-05, "loss": -0.0639, "reward": 0.7963541865348815, "reward_std": 0.13769610971212387, "rewards/accuracy_reward": 0.05208333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7442708492279053, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 763.4146057128906, "epoch": 0.054408705392862855, "grad_norm": 0.04897288233041763, "kl": 0.14503006264567375, "learning_rate": 1.086261980830671e-05, "loss": -0.072, "reward": 0.8734375298023224, "reward_std": 0.12419936545193196, "rewards/accuracy_reward": 0.12500000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7484375178813935, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 773.9437805175781, "epoch": 0.05472875660105617, "grad_norm": 0.04901808127760887, "kl": 0.27871253080666064, "learning_rate": 1.0926517571884985e-05, "loss": -0.0725, "reward": 0.8041666865348815, "reward_std": 0.21438361182808877, "rewards/accuracy_reward": 0.08125000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7229166865348816, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 752.4146057128906, "epoch": 0.05504880780924948, "grad_norm": 0.048184193670749664, "kl": 0.14338683970272542, "learning_rate": 1.099041533546326e-05, "loss": -0.0312, "reward": 0.7796875238418579, "reward_std": 0.1453725527971983, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7338541865348815, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 808.0916931152344, "epoch": 0.05536885901744279, "grad_norm": 0.05158894136548042, "kl": 0.14129090048372744, "learning_rate": 1.1054313099041534e-05, "loss": -0.0349, "reward": 0.8651041924953461, "reward_std": 0.20059835761785508, "rewards/accuracy_reward": 0.11875000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7463541805744172, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 743.1812744140625, "epoch": 0.0556889102256361, "grad_norm": 0.052500639110803604, "kl": 0.19789513647556306, "learning_rate": 1.1118210862619809e-05, "loss": -0.0694, "reward": 0.8750000298023224, "reward_std": 0.14784687310457229, "rewards/accuracy_reward": 0.1458333373069763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.729166692495346, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 788.4729309082031, "epoch": 0.05600896143382941, "grad_norm": 0.045249536633491516, "kl": 0.14523118771612645, "learning_rate": 1.1182108626198084e-05, "loss": -0.0787, "reward": 0.8843750357627869, "reward_std": 0.21863970458507537, "rewards/accuracy_reward": 0.15625000409781933, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.728125023841858, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 791.3062683105469, "epoch": 0.056329012642022724, "grad_norm": 0.04602384939789772, "kl": 0.11468939054757357, "learning_rate": 1.1246006389776358e-05, "loss": -0.0463, "reward": 0.8541666984558105, "reward_std": 0.15563478991389273, "rewards/accuracy_reward": 0.1229166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7312500178813934, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 739.3500244140625, "epoch": 0.056649063850216036, "grad_norm": 0.04179657995700836, "kl": 0.07534434907138347, "learning_rate": 1.1309904153354633e-05, "loss": -0.044, "reward": 0.783854192495346, "reward_std": 0.12365256864577531, "rewards/accuracy_reward": 0.04375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7401041865348816, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 802.0562683105469, "epoch": 0.05696911505840935, "grad_norm": 0.04096505045890808, "kl": 0.08589836191385984, "learning_rate": 1.1373801916932907e-05, "loss": -0.0478, "reward": 0.7515625178813934, "reward_std": 0.11650533098727464, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7515625178813934, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 793.402099609375, "epoch": 0.05728916626660266, "grad_norm": 0.04286341741681099, "kl": 0.0660052813589573, "learning_rate": 1.1437699680511182e-05, "loss": -0.0462, "reward": 0.8166666924953461, "reward_std": 0.1813099652528763, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7479166865348816, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 748.2604370117188, "epoch": 0.05760921747479597, "grad_norm": 0.052840933203697205, "kl": 0.2744094289839268, "learning_rate": 1.1501597444089459e-05, "loss": -0.0967, "reward": 0.8364583730697632, "reward_std": 0.19567819759249688, "rewards/accuracy_reward": 0.11041667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7260416984558106, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 802.1250183105469, "epoch": 0.057929268682989275, "grad_norm": 0.05346866324543953, "kl": 0.10278765726834535, "learning_rate": 1.1565495207667731e-05, "loss": -0.0678, "reward": 0.821354192495346, "reward_std": 0.15091807544231414, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7546875238418579, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 777.9396057128906, "epoch": 0.05824931989118259, "grad_norm": 0.0543404147028923, "kl": 0.38175057210028174, "learning_rate": 1.1629392971246008e-05, "loss": -0.0913, "reward": 0.7859375238418579, "reward_std": 0.22937640249729158, "rewards/accuracy_reward": 0.04375000055879354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7421875178813935, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 792.1708557128907, "epoch": 0.0585693710993759, "grad_norm": 0.047496624290943146, "kl": 0.1264643581584096, "learning_rate": 1.1693290734824283e-05, "loss": -0.0565, "reward": 0.830729192495346, "reward_std": 0.19259970933198928, "rewards/accuracy_reward": 0.08750000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.743229192495346, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 837.839599609375, "epoch": 0.05888942230756921, "grad_norm": 0.09101786464452744, "kl": 0.04611029475927353, "learning_rate": 1.1757188498402557e-05, "loss": -0.0188, "reward": 0.791666692495346, "reward_std": 0.17144267708063127, "rewards/accuracy_reward": 0.016666667722165584, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.775000023841858, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 806.0145935058594, "epoch": 0.05920947351576252, "grad_norm": 0.051978375762701035, "kl": 0.13175021912902593, "learning_rate": 1.1821086261980832e-05, "loss": -0.0792, "reward": 0.8328125178813934, "reward_std": 0.21297012269496918, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7578125178813935, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 832.8146057128906, "epoch": 0.05952952472395583, "grad_norm": 0.05038255825638771, "kl": 0.10234151016920805, "learning_rate": 1.1884984025559106e-05, "loss": -0.0399, "reward": 0.7869791865348816, "reward_std": 0.22053608372807504, "rewards/accuracy_reward": 0.03541666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.751562523841858, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 793.3583557128907, "epoch": 0.059849575932149145, "grad_norm": 0.05190233141183853, "kl": 0.1916276691481471, "learning_rate": 1.1948881789137381e-05, "loss": -0.0471, "reward": 0.8348958373069764, "reward_std": 0.17255963943898678, "rewards/accuracy_reward": 0.06875000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7661458432674408, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 832.2937744140625, "epoch": 0.060169627140342456, "grad_norm": 0.048310406506061554, "kl": 0.3118143383413553, "learning_rate": 1.2012779552715656e-05, "loss": -0.0564, "reward": 0.8093750238418579, "reward_std": 0.19749893844127656, "rewards/accuracy_reward": 0.05208333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7572916865348815, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 855.8937622070313, "epoch": 0.06048967834853577, "grad_norm": 0.041950833052396774, "kl": 0.13190485239028932, "learning_rate": 1.207667731629393e-05, "loss": -0.0476, "reward": 0.8265625238418579, "reward_std": 0.16573217641562224, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7890625178813935, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 851.1979431152344, "epoch": 0.06080972955672908, "grad_norm": 0.05058378353714943, "kl": 0.4103548087179661, "learning_rate": 1.2140575079872205e-05, "loss": -0.0665, "reward": 0.892187523841858, "reward_std": 0.2683277949690819, "rewards/accuracy_reward": 0.12500000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7671875178813934, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 826.3041931152344, "epoch": 0.06112978076492239, "grad_norm": 0.06598315387964249, "kl": 0.1979449477046728, "learning_rate": 1.220447284345048e-05, "loss": -0.065, "reward": 0.8723958492279053, "reward_std": 0.16352599412202834, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7911458492279053, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 879.0062683105468, "epoch": 0.061449831973115696, "grad_norm": 0.04891607537865639, "kl": 0.19413473419845104, "learning_rate": 1.2268370607028754e-05, "loss": -0.0562, "reward": 0.8197916805744171, "reward_std": 0.18419070467352866, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8114583492279053, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 817.7750122070313, "epoch": 0.06176988318130901, "grad_norm": 0.04997456446290016, "kl": 0.21307219099253416, "learning_rate": 1.233226837060703e-05, "loss": -0.0385, "reward": 0.8276041865348815, "reward_std": 0.1461735963821411, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7859375178813934, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 854.0583435058594, "epoch": 0.06208993438950232, "grad_norm": 0.048078566789627075, "kl": 0.18797751162201165, "learning_rate": 1.2396166134185303e-05, "loss": -0.0481, "reward": 0.8651041865348816, "reward_std": 0.1707301080226898, "rewards/accuracy_reward": 0.05416666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8109375178813935, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 827.9250244140625, "epoch": 0.06240998559769563, "grad_norm": 0.0677054226398468, "kl": 0.24947662875056267, "learning_rate": 1.2460063897763578e-05, "loss": -0.0748, "reward": 0.852604192495346, "reward_std": 0.16003775596618652, "rewards/accuracy_reward": 0.039583335444331166, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8130208492279053, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 877.6354370117188, "epoch": 0.06273003680588894, "grad_norm": 0.04754648357629776, "kl": 0.13086143620312213, "learning_rate": 1.2523961661341855e-05, "loss": -0.0492, "reward": 0.8312500298023224, "reward_std": 0.181324028596282, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7916666865348816, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 874.577099609375, "epoch": 0.06305008801408225, "grad_norm": 0.060636699199676514, "kl": 0.8544145112857222, "learning_rate": 1.2587859424920127e-05, "loss": -0.0723, "reward": 0.9250000298023224, "reward_std": 0.1732124462723732, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8229166805744171, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 895.7646057128907, "epoch": 0.06337013922227556, "grad_norm": 0.04687461629509926, "kl": 0.20718304738402366, "learning_rate": 1.2651757188498404e-05, "loss": -0.048, "reward": 0.8713541865348816, "reward_std": 0.22062713280320168, "rewards/accuracy_reward": 0.052083334513008596, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8192708432674408, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 830.4750183105468, "epoch": 0.06369019043046888, "grad_norm": 0.0773216262459755, "kl": 0.8122439078986645, "learning_rate": 1.271565495207668e-05, "loss": -0.1072, "reward": 0.8343750298023224, "reward_std": 0.2193293772637844, "rewards/accuracy_reward": 0.06250000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7718750238418579, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 874.8125244140625, "epoch": 0.06401024163866219, "grad_norm": 0.09615940600633621, "kl": 0.4952188327908516, "learning_rate": 1.2779552715654953e-05, "loss": -0.0731, "reward": 0.9630208551883698, "reward_std": 0.2012898415327072, "rewards/accuracy_reward": 0.1562500050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8067708432674408, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 919.6937622070312, "epoch": 0.0643302928468555, "grad_norm": 0.055171411484479904, "kl": 0.42661799900233743, "learning_rate": 1.2843450479233228e-05, "loss": -0.0889, "reward": 0.8906250298023224, "reward_std": 0.1967291221022606, "rewards/accuracy_reward": 0.041666668653488156, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8489583551883697, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 908.6062683105469, "epoch": 0.06465034405504881, "grad_norm": 0.04904462397098541, "kl": 0.19709489084780216, "learning_rate": 1.2907348242811502e-05, "loss": -0.0544, "reward": 0.9531250357627868, "reward_std": 0.18670724853873252, "rewards/accuracy_reward": 0.1104166692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8427083492279053, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 893.7104431152344, "epoch": 0.06497039526324212, "grad_norm": 0.0566297248005867, "kl": 0.6682713240385055, "learning_rate": 1.2971246006389777e-05, "loss": -0.0859, "reward": 0.9708333551883698, "reward_std": 0.2716028355062008, "rewards/accuracy_reward": 0.1562500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8145833551883698, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 913.8271057128907, "epoch": 0.06529044647143543, "grad_norm": 0.04946311190724373, "kl": 0.3586285777390003, "learning_rate": 1.3035143769968053e-05, "loss": -0.0774, "reward": 0.8635416984558105, "reward_std": 0.2157668873667717, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8177083551883697, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 955.3604370117188, "epoch": 0.06561049767962875, "grad_norm": 0.06902287155389786, "kl": 1.093763279914856, "learning_rate": 1.3099041533546326e-05, "loss": -0.0612, "reward": 0.8843750178813934, "reward_std": 0.1965337350964546, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8510416865348815, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 931.152099609375, "epoch": 0.06593054888782206, "grad_norm": 0.13752013444900513, "kl": 1.419936703145504, "learning_rate": 1.3162939297124601e-05, "loss": -0.0831, "reward": 0.9291666805744171, "reward_std": 0.21198599338531493, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8458333492279053, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 898.8145935058594, "epoch": 0.06625060009601537, "grad_norm": 0.05612272769212723, "kl": 0.6546476993709802, "learning_rate": 1.3226837060702877e-05, "loss": -0.0815, "reward": 0.8734375238418579, "reward_std": 0.21769996285438536, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8171875178813934, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 910.4833618164063, "epoch": 0.06657065130420867, "grad_norm": 0.05039665102958679, "kl": 0.7447992540895939, "learning_rate": 1.329073482428115e-05, "loss": -0.0785, "reward": 0.857812511920929, "reward_std": 0.17638538777828217, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8244791805744172, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 942.6750244140625, "epoch": 0.06689070251240198, "grad_norm": 0.055274222046136856, "kl": 0.6583600550889969, "learning_rate": 1.3354632587859426e-05, "loss": -0.0736, "reward": 1.0552083730697632, "reward_std": 0.1857333317399025, "rewards/accuracy_reward": 0.18750000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8677083611488342, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 939.3333557128906, "epoch": 0.06721075372059529, "grad_norm": 1.966977834701538, "kl": 0.29377989545464517, "learning_rate": 1.3418530351437703e-05, "loss": -0.0634, "reward": 0.9557292103767395, "reward_std": 0.24651620537042618, "rewards/accuracy_reward": 0.11041666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.845312523841858, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 946.7916809082031, "epoch": 0.0675308049287886, "grad_norm": 0.04794445261359215, "kl": 0.19607984125614167, "learning_rate": 1.3482428115015975e-05, "loss": -0.0447, "reward": 0.8927083551883698, "reward_std": 0.2090878263115883, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8468750238418579, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 925.9646057128906, "epoch": 0.06785085613698191, "grad_norm": 0.05501377210021019, "kl": 0.4783424001187086, "learning_rate": 1.3546325878594251e-05, "loss": -0.0918, "reward": 0.9140625119209289, "reward_std": 0.2742405004799366, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8286458373069763, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 910.5520935058594, "epoch": 0.06817090734517522, "grad_norm": 0.06620791554450989, "kl": 0.6241559140384197, "learning_rate": 1.3610223642172523e-05, "loss": -0.085, "reward": 0.8677083492279053, "reward_std": 0.22876578718423843, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8281250238418579, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 917.358349609375, "epoch": 0.06849095855336854, "grad_norm": 0.08500348031520844, "kl": 0.6475715897977352, "learning_rate": 1.36741214057508e-05, "loss": -0.0907, "reward": 0.8208333492279053, "reward_std": 0.24260879009962083, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8187500178813935, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 930.9750244140625, "epoch": 0.06881100976156185, "grad_norm": 0.06262990832328796, "kl": 0.3204282820224762, "learning_rate": 1.3738019169329076e-05, "loss": -0.0806, "reward": 0.8671875178813935, "reward_std": 0.228495317697525, "rewards/accuracy_reward": 0.052083334513008596, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.815104192495346, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 871.8562683105469, "epoch": 0.06913106096975516, "grad_norm": 0.0846937745809555, "kl": 0.5917581547051668, "learning_rate": 1.380191693290735e-05, "loss": -0.0988, "reward": 0.8718750178813934, "reward_std": 0.2557776391506195, "rewards/accuracy_reward": 0.08541666846722365, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7864583432674408, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 877.3812744140625, "epoch": 0.06945111217794847, "grad_norm": 0.1118309274315834, "kl": 0.9335677590221166, "learning_rate": 1.3865814696485625e-05, "loss": -0.1107, "reward": 0.9578125298023223, "reward_std": 0.20639725401997566, "rewards/accuracy_reward": 0.12708333656191825, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8307291805744171, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 907.5916870117187, "epoch": 0.06977116338614178, "grad_norm": 0.14517973363399506, "kl": 0.8811855886131525, "learning_rate": 1.39297124600639e-05, "loss": -0.0957, "reward": 0.901041692495346, "reward_std": 0.2358024850487709, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8281250119209289, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 918.8687622070313, "epoch": 0.0700912145943351, "grad_norm": 0.056523486971855164, "kl": 0.31053061634302137, "learning_rate": 1.3993610223642173e-05, "loss": -0.0742, "reward": 0.9885416984558105, "reward_std": 0.24320609569549562, "rewards/accuracy_reward": 0.1562500052154064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8322916865348816, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 921.8979370117188, "epoch": 0.0704112658025284, "grad_norm": 0.07962542772293091, "kl": 0.40545434355735777, "learning_rate": 1.4057507987220449e-05, "loss": -0.0853, "reward": 0.9197916865348816, "reward_std": 0.24055335223674773, "rewards/accuracy_reward": 0.09166666828095912, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8281250238418579, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 905.5479370117188, "epoch": 0.07073131701072172, "grad_norm": 0.06706108152866364, "kl": 0.6539999444037676, "learning_rate": 1.4121405750798722e-05, "loss": -0.1003, "reward": 0.9619791984558106, "reward_std": 0.2473888464272022, "rewards/accuracy_reward": 0.13333333600312472, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8286458551883698, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 864.2271057128906, "epoch": 0.07105136821891503, "grad_norm": 0.055032406002283096, "kl": 0.45531212612986566, "learning_rate": 1.4185303514376998e-05, "loss": -0.0982, "reward": 0.8796875298023223, "reward_std": 0.19982553869485856, "rewards/accuracy_reward": 0.07083333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8088541865348816, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 942.4041870117187, "epoch": 0.07137141942710834, "grad_norm": 0.054181210696697235, "kl": 0.19609659500420093, "learning_rate": 1.4249201277955273e-05, "loss": -0.0762, "reward": 0.9645833611488343, "reward_std": 0.2233875073492527, "rewards/accuracy_reward": 0.12083333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8437500238418579, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 915.5021118164062, "epoch": 0.07169147063530165, "grad_norm": 0.1139911636710167, "kl": 0.36906580030918124, "learning_rate": 1.4313099041533547e-05, "loss": -0.0601, "reward": 0.8807291865348816, "reward_std": 0.19108127057552338, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8390625238418579, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 934.5166931152344, "epoch": 0.07201152184349496, "grad_norm": 0.05806288123130798, "kl": 0.6494541350752115, "learning_rate": 1.4376996805111822e-05, "loss": -0.0754, "reward": 0.9494791924953461, "reward_std": 0.24038127958774566, "rewards/accuracy_reward": 0.10416666995733977, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.845312523841858, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 939.6083557128907, "epoch": 0.07233157305168827, "grad_norm": 0.09360338002443314, "kl": 0.5438719809055328, "learning_rate": 1.4440894568690099e-05, "loss": -0.0858, "reward": 0.9640625298023224, "reward_std": 0.24050120264291763, "rewards/accuracy_reward": 0.12083333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8432291865348815, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 961.1437744140625, "epoch": 0.07265162425988159, "grad_norm": 0.06449954211711884, "kl": 0.4873860139399767, "learning_rate": 1.450479233226837e-05, "loss": -0.0587, "reward": 0.8567708492279053, "reward_std": 0.2645621284842491, "rewards/accuracy_reward": 0.018750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8380208492279053, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 924.8708557128906, "epoch": 0.0729716754680749, "grad_norm": 0.22151648998260498, "kl": 2.0567788064479826, "learning_rate": 1.4568690095846648e-05, "loss": -0.1014, "reward": 0.9880208551883698, "reward_std": 0.22209313362836838, "rewards/accuracy_reward": 0.13750000409781932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8505208492279053, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 937.3479431152343, "epoch": 0.07329172667626821, "grad_norm": 0.07860510051250458, "kl": 0.3857763078063726, "learning_rate": 1.4632587859424921e-05, "loss": -0.081, "reward": 0.9505208611488343, "reward_std": 0.21638043001294135, "rewards/accuracy_reward": 0.09791666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8526041805744171, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 954.614599609375, "epoch": 0.07361177788446151, "grad_norm": 0.14230084419250488, "kl": 0.5276958528906107, "learning_rate": 1.4696485623003197e-05, "loss": -0.0949, "reward": 0.9317708492279053, "reward_std": 0.275539268553257, "rewards/accuracy_reward": 0.09166667181998492, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8401041865348816, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 961.7937683105469, "epoch": 0.07393182909265482, "grad_norm": 0.0633036196231842, "kl": 0.43820536993443965, "learning_rate": 1.4760383386581472e-05, "loss": -0.0817, "reward": 0.9218750238418579, "reward_std": 0.26296704858541486, "rewards/accuracy_reward": 0.07916666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8427083551883697, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 965.0666870117187, "epoch": 0.07425188030084813, "grad_norm": 0.12930835783481598, "kl": 0.4649226266890764, "learning_rate": 1.4824281150159745e-05, "loss": -0.0555, "reward": 0.8692708551883698, "reward_std": 0.25685995519161225, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8276041865348815, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 972.4396118164062, "epoch": 0.07457193150904144, "grad_norm": 0.09969345480203629, "kl": 0.5674844704568386, "learning_rate": 1.488817891373802e-05, "loss": -0.0716, "reward": 0.9229166984558106, "reward_std": 0.2375131815671921, "rewards/accuracy_reward": 0.05000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8729166924953461, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 942.058349609375, "epoch": 0.07489198271723475, "grad_norm": 0.06822368502616882, "kl": 0.7561644535511732, "learning_rate": 1.4952076677316296e-05, "loss": -0.1003, "reward": 0.8223958551883698, "reward_std": 0.2654576122760773, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8161458551883698, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 989.1646057128906, "epoch": 0.07521203392542807, "grad_norm": 0.3903352916240692, "kl": 0.5844904962927103, "learning_rate": 1.501597444089457e-05, "loss": -0.048, "reward": 0.9182291746139526, "reward_std": 0.23523074388504028, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8765625119209289, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 965.189599609375, "epoch": 0.07553208513362138, "grad_norm": 0.057704098522663116, "kl": 0.7968939036130905, "learning_rate": 1.5079872204472845e-05, "loss": -0.0777, "reward": 0.9833333611488342, "reward_std": 0.23558287769556047, "rewards/accuracy_reward": 0.10208333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8812500238418579, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 961.7687622070313, "epoch": 0.07585213634181469, "grad_norm": 0.05349546670913696, "kl": 0.2956059377640486, "learning_rate": 1.5143769968051119e-05, "loss": -0.0692, "reward": 0.8864583551883698, "reward_std": 0.22086520940065385, "rewards/accuracy_reward": 0.014583333767950535, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8718750178813934, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 968.858349609375, "epoch": 0.076172187550008, "grad_norm": 0.04913521930575371, "kl": 0.1724690929055214, "learning_rate": 1.5207667731629394e-05, "loss": -0.0687, "reward": 0.9208333551883697, "reward_std": 0.19898689687252044, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8812500238418579, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 932.9541870117188, "epoch": 0.07649223875820131, "grad_norm": 0.12697535753250122, "kl": 0.1935725949704647, "learning_rate": 1.527156549520767e-05, "loss": -0.0749, "reward": 0.8927083492279053, "reward_std": 0.21466821134090425, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8552083492279052, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 945.7208557128906, "epoch": 0.07681228996639462, "grad_norm": 0.0980035737156868, "kl": 0.21331431940197945, "learning_rate": 1.5335463258785944e-05, "loss": -0.081, "reward": 0.9015625178813934, "reward_std": 0.21541929244995117, "rewards/accuracy_reward": 0.029166667722165585, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8723958551883697, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 889.0000244140625, "epoch": 0.07713234117458793, "grad_norm": 0.08833970129489899, "kl": 0.39752455055713654, "learning_rate": 1.5399361022364218e-05, "loss": -0.1273, "reward": 0.9666666865348816, "reward_std": 0.196412655711174, "rewards/accuracy_reward": 0.13333333730697633, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8333333432674408, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 915.3437805175781, "epoch": 0.07745239238278125, "grad_norm": 0.11988092958927155, "kl": 0.5734913632273674, "learning_rate": 1.5463258785942495e-05, "loss": -0.1226, "reward": 0.8947916984558105, "reward_std": 0.24440784752368927, "rewards/accuracy_reward": 0.054166667722165586, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8406250298023223, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 934.3604370117188, "epoch": 0.07777244359097456, "grad_norm": 0.16344939172267914, "kl": 1.3431407153606414, "learning_rate": 1.552715654952077e-05, "loss": -0.0986, "reward": 0.8593750298023224, "reward_std": 0.20968187004327773, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8572916924953461, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 929.958349609375, "epoch": 0.07809249479916787, "grad_norm": 0.05979590862989426, "kl": 0.7483044236898422, "learning_rate": 1.5591054313099042e-05, "loss": -0.1062, "reward": 0.9828125238418579, "reward_std": 0.26393072605133056, "rewards/accuracy_reward": 0.1229166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8598958492279053, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 940.4229370117188, "epoch": 0.07841254600736118, "grad_norm": 0.05631721392273903, "kl": 0.6490201197564602, "learning_rate": 1.5654952076677316e-05, "loss": -0.1067, "reward": 0.9223958551883698, "reward_std": 0.20847276747226715, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8807291805744171, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 950.0166931152344, "epoch": 0.07873259721555449, "grad_norm": 0.09853655844926834, "kl": 0.5052209571003914, "learning_rate": 1.5718849840255593e-05, "loss": -0.0544, "reward": 0.9916666984558106, "reward_std": 0.23812063187360763, "rewards/accuracy_reward": 0.12083333935588599, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8708333611488343, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 919.2541809082031, "epoch": 0.0790526484237478, "grad_norm": 0.12566211819648743, "kl": 1.113349625095725, "learning_rate": 1.5782747603833866e-05, "loss": -0.1065, "reward": 0.8838541865348816, "reward_std": 0.23888127356767655, "rewards/accuracy_reward": 0.02916666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8546875178813934, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 945.0062683105468, "epoch": 0.07937269963194112, "grad_norm": 0.11134926974773407, "kl": 1.0100045025348663, "learning_rate": 1.584664536741214e-05, "loss": -0.0955, "reward": 0.9098958492279052, "reward_std": 0.21706083416938782, "rewards/accuracy_reward": 0.01666666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8932291865348816, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 928.4041870117187, "epoch": 0.07969275084013443, "grad_norm": 0.09585289657115936, "kl": 1.7022089518606662, "learning_rate": 1.5910543130990417e-05, "loss": -0.1081, "reward": 0.9406250178813934, "reward_std": 0.23443643376231194, "rewards/accuracy_reward": 0.07500000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.865625011920929, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 953.2375244140625, "epoch": 0.08001280204832774, "grad_norm": 0.08478322625160217, "kl": 1.1752378184348344, "learning_rate": 1.5974440894568694e-05, "loss": -0.0952, "reward": 0.9005208671092987, "reward_std": 0.23451116681098938, "rewards/accuracy_reward": 0.01250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8880208551883697, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 911.4916809082031, "epoch": 0.08033285325652105, "grad_norm": 0.17704196274280548, "kl": 1.9864186983555556, "learning_rate": 1.6038338658146964e-05, "loss": -0.146, "reward": 1.012500023841858, "reward_std": 0.2728649765253067, "rewards/accuracy_reward": 0.1479166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8645833492279053, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 941.0708557128906, "epoch": 0.08065290446471435, "grad_norm": 0.13199065625667572, "kl": 1.3647517763078212, "learning_rate": 1.610223642172524e-05, "loss": -0.1164, "reward": 0.9848958551883698, "reward_std": 0.27456178665161135, "rewards/accuracy_reward": 0.12291667126119137, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.861979192495346, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 954.4291870117188, "epoch": 0.08097295567290766, "grad_norm": 0.05368947610259056, "kl": 0.8016906466335059, "learning_rate": 1.6166134185303515e-05, "loss": -0.0952, "reward": 0.9369791805744171, "reward_std": 0.2690433174371719, "rewards/accuracy_reward": 0.06250000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8744791865348815, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 938.6520935058594, "epoch": 0.08129300688110097, "grad_norm": 0.11912751197814941, "kl": 1.1840459078550338, "learning_rate": 1.623003194888179e-05, "loss": -0.0976, "reward": 1.030729204416275, "reward_std": 0.32827322483062743, "rewards/accuracy_reward": 0.15000000335276126, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8807291865348816, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 961.820849609375, "epoch": 0.08161305808929428, "grad_norm": 0.06231944262981415, "kl": 1.172089009732008, "learning_rate": 1.6293929712460065e-05, "loss": -0.0999, "reward": 0.8864583492279052, "reward_std": 0.24923737421631814, "rewards/accuracy_reward": 0.01041666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8760416865348816, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 958.8812744140625, "epoch": 0.0819331092974876, "grad_norm": 0.06999265402555466, "kl": 0.6742954470217228, "learning_rate": 1.635782747603834e-05, "loss": -0.0619, "reward": 1.0625000238418578, "reward_std": 0.21779928505420684, "rewards/accuracy_reward": 0.16875000596046447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8937500238418579, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 913.3833435058593, "epoch": 0.0822531605056809, "grad_norm": 0.16180475056171417, "kl": 2.9971985332667828, "learning_rate": 1.6421725239616616e-05, "loss": -0.1122, "reward": 0.8828125238418579, "reward_std": 0.2641023457050323, "rewards/accuracy_reward": 0.01458333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.868229192495346, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 923.8458557128906, "epoch": 0.08257321171387422, "grad_norm": 0.3563595414161682, "kl": 2.980782502889633, "learning_rate": 1.648562300319489e-05, "loss": -0.0747, "reward": 0.9838541984558106, "reward_std": 0.25154028832912445, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9046875298023224, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 926.2646057128907, "epoch": 0.08289326292206753, "grad_norm": 0.13118846714496613, "kl": 1.8122212439775467, "learning_rate": 1.6549520766773163e-05, "loss": -0.0782, "reward": 1.0093750298023223, "reward_std": 0.2527278661727905, "rewards/accuracy_reward": 0.09583333600312471, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9135416746139526, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 905.1271057128906, "epoch": 0.08321331413026084, "grad_norm": 0.06394088268280029, "kl": 1.5887242540717126, "learning_rate": 1.661341853035144e-05, "loss": -0.1111, "reward": 0.9333333432674408, "reward_std": 0.2683968171477318, "rewards/accuracy_reward": 0.04583333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8875000059604645, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 892.6187744140625, "epoch": 0.08353336533845415, "grad_norm": 0.1131933182477951, "kl": 0.6811731692403555, "learning_rate": 1.6677316293929714e-05, "loss": -0.0831, "reward": 1.0098958611488342, "reward_std": 0.2009401135146618, "rewards/accuracy_reward": 0.08750000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9223958551883698, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 900.8021057128906, "epoch": 0.08385341654664746, "grad_norm": 0.08476514369249344, "kl": 1.0106553114950656, "learning_rate": 1.6741214057507987e-05, "loss": -0.131, "reward": 0.9656250298023223, "reward_std": 0.22452602237462999, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9135416805744171, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 955.2646057128907, "epoch": 0.08417346775484078, "grad_norm": 0.08032719045877457, "kl": 0.8753206558525563, "learning_rate": 1.6805111821086264e-05, "loss": -0.0422, "reward": 1.049479204416275, "reward_std": 0.2446434311568737, "rewards/accuracy_reward": 0.14791667014360427, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9015625238418579, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 899.0416931152344, "epoch": 0.08449351896303409, "grad_norm": 0.5515336394309998, "kl": 1.3456205856055021, "learning_rate": 1.6869009584664538e-05, "loss": -0.1086, "reward": 0.9744791865348816, "reward_std": 0.22072734907269478, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9098958492279052, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 885.8771057128906, "epoch": 0.0848135701712274, "grad_norm": 0.06949339061975479, "kl": 0.9026762183755637, "learning_rate": 1.693290734824281e-05, "loss": -0.098, "reward": 1.0098958551883697, "reward_std": 0.22607185021042825, "rewards/accuracy_reward": 0.09583333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9140625238418579, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 902.6458618164063, "epoch": 0.08513362137942071, "grad_norm": 0.09065587818622589, "kl": 1.112578734010458, "learning_rate": 1.699680511182109e-05, "loss": -0.0904, "reward": 1.0500000298023224, "reward_std": 0.2474749308079481, "rewards/accuracy_reward": 0.13541666865348817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9145833492279053, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 850.6291931152343, "epoch": 0.08545367258761402, "grad_norm": 0.0645834282040596, "kl": 0.8398421596735716, "learning_rate": 1.7060702875399362e-05, "loss": -0.1208, "reward": 1.0151041805744172, "reward_std": 0.2006215013563633, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9130208551883697, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 903.7958557128907, "epoch": 0.08577372379580733, "grad_norm": 0.059141870588064194, "kl": 0.7011767126619816, "learning_rate": 1.712460063897764e-05, "loss": -0.0965, "reward": 1.0427083611488341, "reward_std": 0.19903551936149597, "rewards/accuracy_reward": 0.10833333749324084, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9343750178813934, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 868.0416870117188, "epoch": 0.08609377500400064, "grad_norm": 0.08352208137512207, "kl": 0.6763238899409771, "learning_rate": 1.7188498402555913e-05, "loss": -0.103, "reward": 0.9942708671092987, "reward_std": 0.20066261664032936, "rewards/accuracy_reward": 0.052083334513008596, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9421875298023223, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 882.5896057128906, "epoch": 0.08641382621219396, "grad_norm": 0.058626312762498856, "kl": 1.3493116207420826, "learning_rate": 1.7252396166134186e-05, "loss": -0.0921, "reward": 1.027604192495346, "reward_std": 0.23043378591537475, "rewards/accuracy_reward": 0.10000000353902579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9276041865348816, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 897.5166870117188, "epoch": 0.08673387742038727, "grad_norm": 0.07342275232076645, "kl": 0.8855142720043659, "learning_rate": 1.7316293929712463e-05, "loss": -0.0822, "reward": 1.0546875298023224, "reward_std": 0.27408884316682813, "rewards/accuracy_reward": 0.12916666753590106, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9255208492279052, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 889.0729370117188, "epoch": 0.08705392862858058, "grad_norm": 0.06917428970336914, "kl": 1.0903139643371105, "learning_rate": 1.7380191693290737e-05, "loss": -0.0621, "reward": 1.035416704416275, "reward_std": 0.22211865484714508, "rewards/accuracy_reward": 0.10208333674818278, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9333333551883698, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 872.6250183105469, "epoch": 0.08737397983677389, "grad_norm": 0.0981907919049263, "kl": 0.7083079494535923, "learning_rate": 1.744408945686901e-05, "loss": -0.053, "reward": 1.0489583671092988, "reward_std": 0.16593048833310603, "rewards/accuracy_reward": 0.09166666828095912, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916865348816, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 877.7541809082031, "epoch": 0.08769403104496719, "grad_norm": 0.18095174431800842, "kl": 2.6296974059194325, "learning_rate": 1.7507987220447287e-05, "loss": -0.0654, "reward": 1.018229180574417, "reward_std": 0.1441993907094002, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9473958492279053, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 875.0437744140625, "epoch": 0.0880140822531605, "grad_norm": 0.13419152796268463, "kl": 3.1912473395466803, "learning_rate": 1.757188498402556e-05, "loss": -0.0698, "reward": 0.9859375178813934, "reward_std": 0.19504168927669524, "rewards/accuracy_reward": 0.05416666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9317708492279053, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 862.0500244140625, "epoch": 0.08833413346135381, "grad_norm": 0.21924547851085663, "kl": 3.4867818232625725, "learning_rate": 1.7635782747603835e-05, "loss": -0.0687, "reward": 0.9635416865348816, "reward_std": 0.1808617640286684, "rewards/accuracy_reward": 0.03125000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.932291692495346, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 864.8125183105469, "epoch": 0.08865418466954712, "grad_norm": 0.49595341086387634, "kl": 2.0808908861130475, "learning_rate": 1.7699680511182108e-05, "loss": -0.0736, "reward": 1.0552083551883698, "reward_std": 0.2792856268584728, "rewards/accuracy_reward": 0.14166666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9135416805744171, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 814.4771118164062, "epoch": 0.08897423587774043, "grad_norm": 0.13152822852134705, "kl": 1.0621005825698375, "learning_rate": 1.7763578274760385e-05, "loss": -0.1042, "reward": 0.9630208551883698, "reward_std": 0.20329482033848761, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9171875178813934, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 760.3854309082031, "epoch": 0.08929428708593375, "grad_norm": 0.33260223269462585, "kl": 1.3636042416095733, "learning_rate": 1.782747603833866e-05, "loss": -0.152, "reward": 0.9802083730697632, "reward_std": 0.26020273864269255, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.901041692495346, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 775.677099609375, "epoch": 0.08961433829412706, "grad_norm": 0.0919167548418045, "kl": 0.8035856388509274, "learning_rate": 1.7891373801916932e-05, "loss": -0.0947, "reward": 1.0625000298023224, "reward_std": 0.23918063938617706, "rewards/accuracy_reward": 0.1604166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9020833551883698, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 843.8791809082031, "epoch": 0.08993438950232037, "grad_norm": 0.06751978397369385, "kl": 0.6697272006422281, "learning_rate": 1.795527156549521e-05, "loss": -0.0986, "reward": 0.9140625119209289, "reward_std": 0.24967138916254045, "rewards/accuracy_reward": 0.00625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9078125119209289, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 797.4062744140625, "epoch": 0.09025444071051368, "grad_norm": 0.06286520510911942, "kl": 0.4617580160498619, "learning_rate": 1.8019169329073486e-05, "loss": -0.0937, "reward": 0.9760416924953461, "reward_std": 0.2386911503970623, "rewards/accuracy_reward": 0.05625000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9197916865348816, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 859.6125244140625, "epoch": 0.09057449191870699, "grad_norm": 0.08970347791910172, "kl": 0.37080557718873025, "learning_rate": 1.808306709265176e-05, "loss": -0.0966, "reward": 0.9406250238418579, "reward_std": 0.22452181503176688, "rewards/accuracy_reward": 0.012500000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9281250178813935, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 862.0187683105469, "epoch": 0.0908945431269003, "grad_norm": 0.11629839241504669, "kl": 0.5317555744200945, "learning_rate": 1.8146964856230033e-05, "loss": -0.0823, "reward": 0.9687500238418579, "reward_std": 0.1916698656976223, "rewards/accuracy_reward": 0.031250000558793546, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9375000178813935, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 808.6896118164062, "epoch": 0.09121459433509362, "grad_norm": 0.04873790219426155, "kl": 0.41392175033688544, "learning_rate": 1.8210862619808307e-05, "loss": -0.0581, "reward": 1.0192708551883698, "reward_std": 0.18486709110438823, "rewards/accuracy_reward": 0.06250000037252904, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9567708492279052, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 823.9062683105469, "epoch": 0.09153464554328693, "grad_norm": 0.06124405190348625, "kl": 0.5761492840945721, "learning_rate": 1.8274760383386584e-05, "loss": -0.0575, "reward": 1.0343750298023224, "reward_std": 0.14318594969809056, "rewards/accuracy_reward": 0.08125000223517417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9531250178813935, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 852.4708557128906, "epoch": 0.09185469675148024, "grad_norm": 0.04435551166534424, "kl": 0.8433479636907577, "learning_rate": 1.8338658146964858e-05, "loss": -0.0992, "reward": 1.0718750298023223, "reward_std": 0.19699937254190444, "rewards/accuracy_reward": 0.12083333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9510416805744171, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 819.7062805175781, "epoch": 0.09217474795967355, "grad_norm": 0.05230933800339699, "kl": 0.5397528253495694, "learning_rate": 1.840255591054313e-05, "loss": -0.0561, "reward": 0.9604166984558106, "reward_std": 0.16495687067508696, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9541666865348816, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 817.6687683105469, "epoch": 0.09249479916786686, "grad_norm": 0.044067054986953735, "kl": 1.0274098832160234, "learning_rate": 1.8466453674121408e-05, "loss": -0.0696, "reward": 1.0661458790302276, "reward_std": 0.138642318546772, "rewards/accuracy_reward": 0.09791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.968229204416275, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 798.7437683105469, "epoch": 0.09281485037606017, "grad_norm": 0.13034485280513763, "kl": 0.8338620312511921, "learning_rate": 1.8530351437699682e-05, "loss": -0.0463, "reward": 1.0578125298023224, "reward_std": 0.13859039451926947, "rewards/accuracy_reward": 0.08750000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.970312523841858, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 833.6333557128906, "epoch": 0.09313490158425348, "grad_norm": 0.17142996191978455, "kl": 1.5055570479482412, "learning_rate": 1.8594249201277955e-05, "loss": -0.04, "reward": 1.0614583611488342, "reward_std": 0.19720946326851846, "rewards/accuracy_reward": 0.10833333693444729, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9531250238418579, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 833.6458618164063, "epoch": 0.0934549527924468, "grad_norm": 0.092293381690979, "kl": 0.9619705751538277, "learning_rate": 1.8658146964856232e-05, "loss": -0.0269, "reward": 1.0546875238418578, "reward_std": 0.11363485138863325, "rewards/accuracy_reward": 0.08125000223517417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375178813934, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 812.5312683105469, "epoch": 0.09377500400064011, "grad_norm": 0.18601863086223602, "kl": 1.3381911851465702, "learning_rate": 1.8722044728434506e-05, "loss": -0.0482, "reward": 1.0447916984558105, "reward_std": 0.15205052942037584, "rewards/accuracy_reward": 0.08541666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9593750238418579, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 816.6041870117188, "epoch": 0.09409505520883342, "grad_norm": 0.3239324986934662, "kl": 2.9891052283346653, "learning_rate": 1.878594249201278e-05, "loss": -0.0762, "reward": 0.9588541865348816, "reward_std": 0.1921792333945632, "rewards/accuracy_reward": 0.020833334513008596, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9380208492279053, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 831.8125244140625, "epoch": 0.09441510641702672, "grad_norm": 0.12830707430839539, "kl": 0.6059975288808346, "learning_rate": 1.8849840255591057e-05, "loss": -0.0271, "reward": 1.0671875298023223, "reward_std": 0.10790065918117761, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041865348816, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 818.6937683105468, "epoch": 0.09473515762522003, "grad_norm": 0.04302423447370529, "kl": 1.0549761176109314, "learning_rate": 1.891373801916933e-05, "loss": -0.0439, "reward": 0.9984375238418579, "reward_std": 0.1427600732073188, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9526041805744171, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 814.9416870117187, "epoch": 0.09505520883341334, "grad_norm": 0.14698351919651031, "kl": 0.6408292330801487, "learning_rate": 1.8977635782747604e-05, "loss": -0.0273, "reward": 0.9041666805744171, "reward_std": 0.19298009127378463, "rewards/accuracy_reward": 0.03958333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8645833492279053, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 786.627099609375, "epoch": 0.09537526004160665, "grad_norm": 0.12881284952163696, "kl": 0.522902286797762, "learning_rate": 1.904153354632588e-05, "loss": -0.0235, "reward": 0.9333333671092987, "reward_std": 0.20901244282722473, "rewards/accuracy_reward": 0.06250000093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8708333492279052, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 867.3333557128906, "epoch": 0.09569531124979996, "grad_norm": 0.0646795704960823, "kl": 0.4893409203737974, "learning_rate": 1.9105431309904154e-05, "loss": -0.0422, "reward": 1.0057291984558105, "reward_std": 0.1382185447961092, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458611488343, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 782.7833557128906, "epoch": 0.09601536245799328, "grad_norm": 0.06920057535171509, "kl": 1.862360952794552, "learning_rate": 1.916932907348243e-05, "loss": -0.1221, "reward": 0.9901041924953461, "reward_std": 0.20693937465548515, "rewards/accuracy_reward": 0.04791666902601719, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9421875178813934, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 838.0146057128907, "epoch": 0.09633541366618659, "grad_norm": 0.06644880771636963, "kl": 1.1580650568008424, "learning_rate": 1.9233226837060705e-05, "loss": -0.1085, "reward": 0.9890625298023223, "reward_std": 0.1861676774919033, "rewards/accuracy_reward": 0.04375000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9453125298023224, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 843.3458557128906, "epoch": 0.0966554648743799, "grad_norm": 0.07187503576278687, "kl": 0.7149579245597124, "learning_rate": 1.929712460063898e-05, "loss": -0.0523, "reward": 1.0875000476837158, "reward_std": 0.14358032830059528, "rewards/accuracy_reward": 0.1166666703298688, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333551883698, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 793.3000244140625, "epoch": 0.09697551608257321, "grad_norm": 0.07028498500585556, "kl": 0.5270621210336686, "learning_rate": 1.9361022364217256e-05, "loss": -0.0411, "reward": 1.0192708492279052, "reward_std": 0.14648367427289485, "rewards/accuracy_reward": 0.0520833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875178813935, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 851.6541809082031, "epoch": 0.09729556729076652, "grad_norm": 0.08742289245128632, "kl": 0.9225699122995138, "learning_rate": 1.942492012779553e-05, "loss": -0.0382, "reward": 1.0802083492279053, "reward_std": 0.17449979558587075, "rewards/accuracy_reward": 0.1083333371207118, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750238418579, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 815.5333557128906, "epoch": 0.09761561849895983, "grad_norm": 0.148127943277359, "kl": 1.2980008512735366, "learning_rate": 1.9488817891373803e-05, "loss": -0.0697, "reward": 1.0197916865348815, "reward_std": 0.1628091825172305, "rewards/accuracy_reward": 0.05625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9635416865348816, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 822.1250244140625, "epoch": 0.09793566970715314, "grad_norm": 0.08461788296699524, "kl": 0.48892029859125613, "learning_rate": 1.955271565495208e-05, "loss": -0.026, "reward": 1.1718750298023224, "reward_std": 0.14723527543246745, "rewards/accuracy_reward": 0.19166667219251393, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083551883698, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 825.6416748046875, "epoch": 0.09825572091534646, "grad_norm": 0.09465766698122025, "kl": 1.6012873794883489, "learning_rate": 1.9616613418530353e-05, "loss": -0.0675, "reward": 1.0411458551883697, "reward_std": 0.14493329152464868, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791805744171, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 861.5354431152343, "epoch": 0.09857577212353977, "grad_norm": 0.07408089190721512, "kl": 1.533422616124153, "learning_rate": 1.9680511182108627e-05, "loss": -0.0936, "reward": 0.9979166805744171, "reward_std": 0.18721573576331138, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.956250011920929, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 880.7458618164062, "epoch": 0.09889582333173308, "grad_norm": 0.04676924645900726, "kl": 0.6797443836927414, "learning_rate": 1.97444089456869e-05, "loss": -0.0341, "reward": 1.035416692495346, "reward_std": 0.1299929341301322, "rewards/accuracy_reward": 0.06041666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000178813935, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 895.7854370117187, "epoch": 0.09921587453992639, "grad_norm": 0.10597676038742065, "kl": 0.8373191263526678, "learning_rate": 1.9808306709265177e-05, "loss": -0.0276, "reward": 1.0484375178813934, "reward_std": 0.11354983560740947, "rewards/accuracy_reward": 0.07708333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541805744171, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 871.6062622070312, "epoch": 0.0995359257481197, "grad_norm": 0.04293238744139671, "kl": 0.7700838308781386, "learning_rate": 1.987220447284345e-05, "loss": -0.0357, "reward": 1.0250000298023223, "reward_std": 0.16061475723981858, "rewards/accuracy_reward": 0.06041666846722364, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833492279052, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 876.1854370117187, "epoch": 0.09985597695631301, "grad_norm": 0.04478353261947632, "kl": 0.7534620493650437, "learning_rate": 1.9936102236421725e-05, "loss": -0.0471, "reward": 1.0213542044162751, "reward_std": 0.17604071646928787, "rewards/accuracy_reward": 0.0520833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708492279053, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 847.4937805175781, "epoch": 0.10017602816450633, "grad_norm": 0.217251256108284, "kl": 1.3029839828610421, "learning_rate": 2e-05, "loss": -0.0849, "reward": 0.9958333551883698, "reward_std": 0.1536863178014755, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333551883697, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 871.7458557128906, "epoch": 0.10049607937269964, "grad_norm": 0.07100000232458115, "kl": 1.1783099208027124, "learning_rate": 1.99999937547761e-05, "loss": -0.0472, "reward": 1.017187523841858, "reward_std": 0.1712120622396469, "rewards/accuracy_reward": 0.0541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208432674408, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 864.9416809082031, "epoch": 0.10081613058089295, "grad_norm": 0.05442134663462639, "kl": 0.8484370153397321, "learning_rate": 1.9999975019112187e-05, "loss": -0.0856, "reward": 1.0411458730697631, "reward_std": 0.18634050339460373, "rewards/accuracy_reward": 0.09583333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9453125238418579, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 812.7146057128906, "epoch": 0.10113618178908626, "grad_norm": 0.05835329368710518, "kl": 0.7999336563050747, "learning_rate": 1.9999943793031672e-05, "loss": -0.071, "reward": 1.035937535762787, "reward_std": 0.2407546177506447, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9109375238418579, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 850.0979370117187, "epoch": 0.10145623299727956, "grad_norm": 0.08183707296848297, "kl": 0.9409760713577271, "learning_rate": 1.9999900076573555e-05, "loss": -0.0842, "reward": 0.9593750357627868, "reward_std": 0.2523031309247017, "rewards/accuracy_reward": 0.0666666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8927083551883698, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 829.1187744140625, "epoch": 0.10177628420547287, "grad_norm": 0.16990694403648376, "kl": 0.8687799766659736, "learning_rate": 1.999984386979244e-05, "loss": -0.0651, "reward": 0.9723958611488343, "reward_std": 0.20160830169916152, "rewards/accuracy_reward": 0.09166666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8807291924953461, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 812.4437683105468, "epoch": 0.10209633541366618, "grad_norm": 0.10622044652700424, "kl": 0.5579104773700237, "learning_rate": 1.999977517275853e-05, "loss": -0.0654, "reward": 0.9927083551883698, "reward_std": 0.25271194577217104, "rewards/accuracy_reward": 0.11875000409781933, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8739583432674408, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 827.0458618164063, "epoch": 0.10241638662185949, "grad_norm": 0.14122267067432404, "kl": 0.9455968786031008, "learning_rate": 1.9999693985557632e-05, "loss": -0.118, "reward": 0.9593750357627868, "reward_std": 0.20687768310308458, "rewards/accuracy_reward": 0.08333333730697631, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8760416865348816, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 791.833349609375, "epoch": 0.1027364378300528, "grad_norm": 0.1345462054014206, "kl": 0.6353983476758003, "learning_rate": 1.999960030829115e-05, "loss": -0.1179, "reward": 0.9557291865348816, "reward_std": 0.27604651898145677, "rewards/accuracy_reward": 0.09583333749324083, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8598958492279053, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 807.3687805175781, "epoch": 0.10305648903824612, "grad_norm": 0.19280694425106049, "kl": 1.307307593524456, "learning_rate": 1.99994941410761e-05, "loss": -0.139, "reward": 0.8828125238418579, "reward_std": 0.2544379264116287, "rewards/accuracy_reward": 0.01458333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8682291865348816, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 823.5916931152344, "epoch": 0.10337654024643943, "grad_norm": 0.2708165943622589, "kl": 1.0159433037042618, "learning_rate": 1.9999375484045077e-05, "loss": -0.1316, "reward": 0.8807291924953461, "reward_std": 0.26034645885229113, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8411458551883697, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 866.6270935058594, "epoch": 0.10369659145463274, "grad_norm": 0.6394182443618774, "kl": 1.2064526498317718, "learning_rate": 1.99992443373463e-05, "loss": -0.1229, "reward": 0.9432291865348816, "reward_std": 0.275150665640831, "rewards/accuracy_reward": 0.09166666716337205, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8515625178813935, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 898.0916870117187, "epoch": 0.10401664266282605, "grad_norm": 0.07190922647714615, "kl": 1.2092401087284088, "learning_rate": 1.999910070114357e-05, "loss": -0.1126, "reward": 0.9255208551883698, "reward_std": 0.2189515396952629, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.883854192495346, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 915.2458557128906, "epoch": 0.10433669387101936, "grad_norm": 0.06787604093551636, "kl": 0.6662247460335493, "learning_rate": 1.99989445756163e-05, "loss": -0.0825, "reward": 0.9942708492279053, "reward_std": 0.20946806892752648, "rewards/accuracy_reward": 0.07500000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9192708551883697, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 929.2458435058594, "epoch": 0.10465674507921267, "grad_norm": 0.0813896581530571, "kl": 0.6081900119781494, "learning_rate": 1.999877596095949e-05, "loss": -0.0544, "reward": 1.0223958611488342, "reward_std": 0.2039100807160139, "rewards/accuracy_reward": 0.09375000353902578, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9286458551883697, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 955.689599609375, "epoch": 0.10497679628740599, "grad_norm": 0.06041910871863365, "kl": 0.9724824227392673, "learning_rate": 1.9998594857383756e-05, "loss": -0.0732, "reward": 1.024479204416275, "reward_std": 0.20781310126185418, "rewards/accuracy_reward": 0.11666666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9078125298023224, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 923.4541870117188, "epoch": 0.1052968474955993, "grad_norm": 0.07968918979167938, "kl": 0.5805226668715477, "learning_rate": 1.99984012651153e-05, "loss": -0.0639, "reward": 1.0041666865348815, "reward_std": 0.23270507901906967, "rewards/accuracy_reward": 0.09166666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.912500011920929, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 942.1896118164062, "epoch": 0.10561689870379261, "grad_norm": 0.04409494251012802, "kl": 1.243473695591092, "learning_rate": 1.999819518439593e-05, "loss": -0.0951, "reward": 1.0411458551883697, "reward_std": 0.2635225549340248, "rewards/accuracy_reward": 0.14166667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8994791746139527, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 953.5604309082031, "epoch": 0.10593694991198592, "grad_norm": 0.3219020366668701, "kl": 0.40860943496227264, "learning_rate": 1.9997976615483042e-05, "loss": -0.0361, "reward": 1.0703125178813935, "reward_std": 0.22589490786194802, "rewards/accuracy_reward": 0.14583333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9244791865348816, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 940.6187683105469, "epoch": 0.10625700112017923, "grad_norm": 0.13125485181808472, "kl": 0.620623991638422, "learning_rate": 1.9997745558649647e-05, "loss": -0.0476, "reward": 0.9453125238418579, "reward_std": 0.19392418172210454, "rewards/accuracy_reward": 0.02708333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9182291865348816, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 947.670849609375, "epoch": 0.10657705232837254, "grad_norm": 0.054816894233226776, "kl": 1.84022078178823, "learning_rate": 1.9997502014184348e-05, "loss": -0.0839, "reward": 1.018229204416275, "reward_std": 0.2765422374010086, "rewards/accuracy_reward": 0.11875000596046448, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.899479192495346, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 942.4208557128907, "epoch": 0.10689710353656585, "grad_norm": 0.099627286195755, "kl": 1.0009838584810495, "learning_rate": 1.9997245982391335e-05, "loss": -0.0466, "reward": 0.9750000238418579, "reward_std": 0.23383331745862962, "rewards/accuracy_reward": 0.05625000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9187500238418579, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 934.4166809082031, "epoch": 0.10721715474475917, "grad_norm": 0.1346578598022461, "kl": 2.6244244679808615, "learning_rate": 1.9996977463590404e-05, "loss": -0.0918, "reward": 0.9026041865348816, "reward_std": 0.2553748100996017, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8963541805744171, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 904.3145935058594, "epoch": 0.10753720595295248, "grad_norm": 0.264816552400589, "kl": 3.066745951771736, "learning_rate": 1.9996696458116953e-05, "loss": -0.1219, "reward": 1.0015625298023223, "reward_std": 0.3009426400065422, "rewards/accuracy_reward": 0.12083333898335695, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8807291924953461, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 890.7854309082031, "epoch": 0.10785725716114579, "grad_norm": 4.931536674499512, "kl": 2.893843525648117, "learning_rate": 1.9996402966321962e-05, "loss": -0.1432, "reward": 0.8864583551883698, "reward_std": 0.2413546308875084, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8843750178813934, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 856.8562622070312, "epoch": 0.1081773083693391, "grad_norm": 0.24606415629386902, "kl": 1.80047315210104, "learning_rate": 1.9996096988572026e-05, "loss": -0.1215, "reward": 0.9812500238418579, "reward_std": 0.250549491494894, "rewards/accuracy_reward": 0.06666666958481074, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9145833551883698, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 782.8062805175781, "epoch": 0.1084973595775324, "grad_norm": 0.3353343605995178, "kl": 1.0522517710924149, "learning_rate": 1.999577852524931e-05, "loss": -0.1469, "reward": 1.084375023841858, "reward_std": 0.2656236305832863, "rewards/accuracy_reward": 0.2104166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8739583551883697, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 768.8812683105468, "epoch": 0.10881741078572571, "grad_norm": 0.5174300670623779, "kl": 1.7762677013874053, "learning_rate": 1.9995447576751605e-05, "loss": -0.1842, "reward": 1.0130208492279054, "reward_std": 0.317549966275692, "rewards/accuracy_reward": 0.17916667219251395, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8338541746139526, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 610.8937683105469, "epoch": 0.10913746199391902, "grad_norm": 0.8624448776245117, "kl": 3.7803176999092103, "learning_rate": 1.999510414349227e-05, "loss": -0.3299, "reward": 0.7390625357627869, "reward_std": 0.36838062703609464, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6661458611488342, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 566.7271026611328, "epoch": 0.10945751320211233, "grad_norm": 0.5762320160865784, "kl": 2.1153181910514833, "learning_rate": 1.9994748225900277e-05, "loss": -0.3942, "reward": 0.6640625327825547, "reward_std": 0.36143629252910614, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.622395858168602, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 520.2395965576172, "epoch": 0.10977756441030564, "grad_norm": 0.8417689204216003, "kl": 1.8840416431427003, "learning_rate": 1.999437982442017e-05, "loss": -0.4202, "reward": 0.5890625149011612, "reward_std": 0.38141846358776094, "rewards/accuracy_reward": 0.01666666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.572395846247673, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 717.4833557128907, "epoch": 0.11009761561849896, "grad_norm": 0.527220606803894, "kl": 1.1728574931621552, "learning_rate": 1.9993998939512113e-05, "loss": -0.2318, "reward": 0.7536458611488343, "reward_std": 0.3177986368536949, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7432291865348816, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 806.5271057128906, "epoch": 0.11041766682669227, "grad_norm": 0.39360710978507996, "kl": 0.9293541312217712, "learning_rate": 1.9993605571651838e-05, "loss": -0.1071, "reward": 0.9187500238418579, "reward_std": 0.28557206094264986, "rewards/accuracy_reward": 0.07291666921228171, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8458333432674408, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 802.089599609375, "epoch": 0.11073771803488558, "grad_norm": 0.4722810387611389, "kl": 1.2333260536193849, "learning_rate": 1.9993199721330684e-05, "loss": -0.0961, "reward": 0.9656250417232514, "reward_std": 0.28917737156152723, "rewards/accuracy_reward": 0.09791666977107524, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8677083551883698, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 814.895849609375, "epoch": 0.11105776924307889, "grad_norm": 1.1194065809249878, "kl": 1.6250961005687714, "learning_rate": 1.9992781389055576e-05, "loss": -0.089, "reward": 0.8333333492279053, "reward_std": 0.2882629260420799, "rewards/accuracy_reward": 0.008333333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8250000178813934, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 799.427099609375, "epoch": 0.1113778204512722, "grad_norm": 0.31969064474105835, "kl": 0.6975180029869079, "learning_rate": 1.999235057534903e-05, "loss": -0.065, "reward": 0.9895833671092987, "reward_std": 0.22908852323889733, "rewards/accuracy_reward": 0.11666667070239782, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8729166984558105, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 855.2979370117188, "epoch": 0.11169787165946551, "grad_norm": 0.10219820588827133, "kl": 0.6862524829804897, "learning_rate": 1.9991907280749148e-05, "loss": -0.0616, "reward": 0.985416692495346, "reward_std": 0.2100462459027767, "rewards/accuracy_reward": 0.07083333488553763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9145833432674408, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 830.7208557128906, "epoch": 0.11201792286765883, "grad_norm": 0.08064544945955276, "kl": 0.41558183878660204, "learning_rate": 1.999145150580963e-05, "loss": -0.0925, "reward": 1.020312523841858, "reward_std": 0.2063022270798683, "rewards/accuracy_reward": 0.10208333488553763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9182291865348816, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 853.3708618164062, "epoch": 0.11233797407585214, "grad_norm": 0.0841941237449646, "kl": 0.29322315752506256, "learning_rate": 1.9990983251099755e-05, "loss": -0.0366, "reward": 0.9796875238418579, "reward_std": 0.18505462631583214, "rewards/accuracy_reward": 0.05416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9255208551883698, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 903.9750244140625, "epoch": 0.11265802528404545, "grad_norm": 0.13465115427970886, "kl": 0.41350857689976694, "learning_rate": 1.99905025172044e-05, "loss": -0.0597, "reward": 1.0109375178813935, "reward_std": 0.24164980798959732, "rewards/accuracy_reward": 0.09166667014360427, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9192708492279053, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 899.0729309082031, "epoch": 0.11297807649223876, "grad_norm": 0.08185650408267975, "kl": 0.18878451809287072, "learning_rate": 1.999000930472401e-05, "loss": -0.0331, "reward": 1.0036458432674409, "reward_std": 0.22958993241190911, "rewards/accuracy_reward": 0.08541667070239782, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9182291746139526, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 914.9812683105469, "epoch": 0.11329812770043207, "grad_norm": 0.07535237818956375, "kl": 0.24463820680975915, "learning_rate": 1.9989503614274647e-05, "loss": -0.0303, "reward": 1.0432291984558106, "reward_std": 0.14151984304189683, "rewards/accuracy_reward": 0.0854166692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.957812511920929, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 887.8062622070313, "epoch": 0.11361817890862538, "grad_norm": 0.08462147414684296, "kl": 0.13488904759287834, "learning_rate": 1.998898544648793e-05, "loss": -0.0307, "reward": 1.0223958611488342, "reward_std": 0.18293874636292456, "rewards/accuracy_reward": 0.07083333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9515625178813935, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 883.7875183105468, "epoch": 0.1139382301168187, "grad_norm": 0.09306566417217255, "kl": 0.21404382959008217, "learning_rate": 1.9988454802011077e-05, "loss": -0.0349, "reward": 1.0661458611488341, "reward_std": 0.16979529410600663, "rewards/accuracy_reward": 0.11041666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291805744171, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 874.3729431152344, "epoch": 0.114258281325012, "grad_norm": 0.08024463802576065, "kl": 0.23921761214733123, "learning_rate": 1.9987911681506886e-05, "loss": -0.0446, "reward": 1.0171875357627869, "reward_std": 0.14690931476652622, "rewards/accuracy_reward": 0.05625000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375178813935, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 870.5250122070313, "epoch": 0.11457833253320532, "grad_norm": 0.14203424751758575, "kl": 0.28406247273087504, "learning_rate": 1.9987356085653738e-05, "loss": -0.0558, "reward": 1.035416692495346, "reward_std": 0.18686591796576976, "rewards/accuracy_reward": 0.09583333693444729, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9395833432674408, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 818.483349609375, "epoch": 0.11489838374139863, "grad_norm": 0.28132274746894836, "kl": 0.4927747845649719, "learning_rate": 1.9986788015145597e-05, "loss": -0.1037, "reward": 0.997916704416275, "reward_std": 0.19941441863775253, "rewards/accuracy_reward": 0.05625000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9416666924953461, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 880.3646057128906, "epoch": 0.11521843494959194, "grad_norm": 0.05523926019668579, "kl": 0.2463509477674961, "learning_rate": 1.9986207470692012e-05, "loss": -0.023, "reward": 1.0234375298023224, "reward_std": 0.1478660933673382, "rewards/accuracy_reward": 0.07708333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9463541865348816, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 821.1583557128906, "epoch": 0.11553848615778524, "grad_norm": 0.05657940357923508, "kl": 0.20335216149687768, "learning_rate": 1.99856144530181e-05, "loss": -0.0431, "reward": 1.0885416924953462, "reward_std": 0.16149957925081254, "rewards/accuracy_reward": 0.12708333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583551883698, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 840.4000122070313, "epoch": 0.11585853736597855, "grad_norm": 0.06203540042042732, "kl": 0.3108247257769108, "learning_rate": 1.9985008962864582e-05, "loss": -0.0275, "reward": 1.0005208551883698, "reward_std": 0.17917531132698059, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9546875119209289, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 803.0208557128906, "epoch": 0.11617858857417186, "grad_norm": 0.07565395534038544, "kl": 0.26133017241954803, "learning_rate": 1.998439100098773e-05, "loss": -0.0152, "reward": 1.1135416984558106, "reward_std": 0.16440754048526288, "rewards/accuracy_reward": 0.1500000035390258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.963541692495346, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 850.1541809082031, "epoch": 0.11649863978236517, "grad_norm": 0.10145269334316254, "kl": 0.15966624915599822, "learning_rate": 1.998376056815941e-05, "loss": -0.0171, "reward": 1.0598958611488343, "reward_std": 0.1260958842933178, "rewards/accuracy_reward": 0.0916666692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291805744171, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 796.1166809082031, "epoch": 0.11681869099055849, "grad_norm": 0.05918211117386818, "kl": 0.19708489403128623, "learning_rate": 1.998311766516706e-05, "loss": -0.0228, "reward": 1.0380208671092988, "reward_std": 0.12481046803295612, "rewards/accuracy_reward": 0.06458333600312471, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375178813934, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 813.2312683105469, "epoch": 0.1171387421987518, "grad_norm": 0.4396161735057831, "kl": 0.4802224151790142, "learning_rate": 1.9982462292813693e-05, "loss": -0.0249, "reward": 1.076562523841858, "reward_std": 0.12219937145709991, "rewards/accuracy_reward": 0.10833333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291865348815, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 806.1104309082032, "epoch": 0.11745879340694511, "grad_norm": 0.19936463236808777, "kl": 0.4187732309103012, "learning_rate": 1.99817944519179e-05, "loss": -0.0197, "reward": 0.9619791805744171, "reward_std": 0.1182825243100524, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958492279053, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 776.3541809082031, "epoch": 0.11777884461513842, "grad_norm": 0.16198308765888214, "kl": 0.5005293294787407, "learning_rate": 1.998111414331385e-05, "loss": 0.0007, "reward": 1.0546875298023224, "reward_std": 0.15743937194347382, "rewards/accuracy_reward": 0.08958333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041746139526, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 801.1104370117188, "epoch": 0.11809889582333173, "grad_norm": 0.09354270994663239, "kl": 0.3035903625190258, "learning_rate": 1.9980421367851268e-05, "loss": -0.0126, "reward": 1.025000023841858, "reward_std": 0.1401256587356329, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9562500178813934, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 722.7916931152344, "epoch": 0.11841894703152504, "grad_norm": 0.08353027701377869, "kl": 0.282787824422121, "learning_rate": 1.997971612639547e-05, "loss": -0.007, "reward": 1.0192708611488341, "reward_std": 0.149939689040184, "rewards/accuracy_reward": 0.05416666921228171, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041805744172, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 701.8291870117188, "epoch": 0.11873899823971835, "grad_norm": 0.06042907014489174, "kl": 0.4572564627975225, "learning_rate": 1.9978998419827328e-05, "loss": -0.0004, "reward": 1.0651041924953462, "reward_std": 0.12739601023495198, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375178813935, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 751.1271057128906, "epoch": 0.11905904944791167, "grad_norm": 0.06057741865515709, "kl": 0.5198903292417526, "learning_rate": 1.9978268249043296e-05, "loss": -0.0026, "reward": 1.0135417044162751, "reward_std": 0.17310468517243863, "rewards/accuracy_reward": 0.05833333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9552083551883698, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 774.7937744140625, "epoch": 0.11937910065610498, "grad_norm": 0.043199047446250916, "kl": 0.21139018721878527, "learning_rate": 1.9977525614955388e-05, "loss": -0.0245, "reward": 1.0348958611488341, "reward_std": 0.11672738809138536, "rewards/accuracy_reward": 0.06041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791865348816, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 740.489599609375, "epoch": 0.11969915186429829, "grad_norm": 0.10318101197481155, "kl": 0.26686358749866484, "learning_rate": 1.9976770518491184e-05, "loss": -0.0266, "reward": 1.0958333611488342, "reward_std": 0.1272334760054946, "rewards/accuracy_reward": 0.12916667088866235, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666865348816, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 719.5041870117187, "epoch": 0.1200192030724916, "grad_norm": 0.07899950444698334, "kl": 0.3719005145132542, "learning_rate": 1.9976002960593833e-05, "loss": -0.0268, "reward": 1.0302083611488342, "reward_std": 0.1475877858698368, "rewards/accuracy_reward": 0.06250000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083492279053, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 741.1729309082032, "epoch": 0.12033925428068491, "grad_norm": 0.04784136265516281, "kl": 0.19529346860945224, "learning_rate": 1.9975222942222054e-05, "loss": -0.0211, "reward": 1.074479192495346, "reward_std": 0.10090533643960953, "rewards/accuracy_reward": 0.0895833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958432674408, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 741.6521118164062, "epoch": 0.12065930548887822, "grad_norm": 0.062281284481287, "kl": 0.323552380874753, "learning_rate": 1.9974430464350125e-05, "loss": -0.0074, "reward": 0.9817708432674408, "reward_std": 0.10530988723039628, "rewards/accuracy_reward": 0.008333333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973437511920929, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 756.4166809082031, "epoch": 0.12097935669707154, "grad_norm": 0.030522586777806282, "kl": 0.09794650189578533, "learning_rate": 1.997362552796788e-05, "loss": -0.0219, "reward": 1.0385416865348815, "reward_std": 0.09175706021487713, "rewards/accuracy_reward": 0.05625000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916805744171, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 735.7312744140625, "epoch": 0.12129940790526485, "grad_norm": 0.0645739808678627, "kl": 0.14556628093123436, "learning_rate": 1.9972808134080726e-05, "loss": -0.0531, "reward": 1.093750035762787, "reward_std": 0.14756546672433615, "rewards/accuracy_reward": 0.11875000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000238418579, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 737.6625183105468, "epoch": 0.12161945911345816, "grad_norm": 0.06165073439478874, "kl": 0.16611265018582344, "learning_rate": 1.9971978283709624e-05, "loss": 0.0031, "reward": 1.0317708551883698, "reward_std": 0.11711971126496792, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041865348816, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 768.0479431152344, "epoch": 0.12193951032165147, "grad_norm": 0.08346700668334961, "kl": 0.1517067790031433, "learning_rate": 1.9971135977891093e-05, "loss": 0.0006, "reward": 1.0239583611488343, "reward_std": 0.16878514513373374, "rewards/accuracy_reward": 0.07500000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9489583551883698, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 697.964599609375, "epoch": 0.12225956152984478, "grad_norm": 0.20756888389587402, "kl": 0.3224208764731884, "learning_rate": 1.9970281217677207e-05, "loss": 0.0314, "reward": 1.0057291865348816, "reward_std": 0.20953557565808295, "rewards/accuracy_reward": 0.06875000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9369791865348815, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 760.6562744140625, "epoch": 0.12257961273803808, "grad_norm": 0.12297973781824112, "kl": 0.1835591211915016, "learning_rate": 1.996941400413561e-05, "loss": 0.0196, "reward": 0.9614583671092987, "reward_std": 0.15597959961742164, "rewards/accuracy_reward": 0.014583333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9468750298023224, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 731.2791809082031, "epoch": 0.12289966394623139, "grad_norm": 0.14354756474494934, "kl": 0.22697254866361619, "learning_rate": 1.996853433834948e-05, "loss": -0.0106, "reward": 1.0005208551883698, "reward_std": 0.13219761587679385, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9546875178813934, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 721.2291809082031, "epoch": 0.1232197151544247, "grad_norm": 0.3283243179321289, "kl": 0.7633262783288955, "learning_rate": 1.996764222141756e-05, "loss": -0.0299, "reward": 0.9703125298023224, "reward_std": 0.25556026250123975, "rewards/accuracy_reward": 0.05833333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9119791924953461, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 719.9437683105468, "epoch": 0.12353976636261801, "grad_norm": 0.8115874528884888, "kl": 0.4921633303165436, "learning_rate": 1.9966737654454153e-05, "loss": -0.0014, "reward": 0.9791666984558105, "reward_std": 0.21891369968652724, "rewards/accuracy_reward": 0.0645833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9145833671092987, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 712.5854431152344, "epoch": 0.12385981757081133, "grad_norm": 0.18551310896873474, "kl": 0.20581552758812904, "learning_rate": 1.9965820638589095e-05, "loss": 0.052, "reward": 1.1244791924953461, "reward_std": 0.19005530402064325, "rewards/accuracy_reward": 0.1687500074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291865348816, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 771.4646057128906, "epoch": 0.12417986877900464, "grad_norm": 0.09523138403892517, "kl": 0.0998759150505066, "learning_rate": 1.9964891174967786e-05, "loss": 0.0152, "reward": 0.9786458432674408, "reward_std": 0.14558621719479561, "rewards/accuracy_reward": 0.02708333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.951562511920929, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 762.3541931152344, "epoch": 0.12449991998719795, "grad_norm": 0.05193324387073517, "kl": 0.09262499958276749, "learning_rate": 1.996394926475116e-05, "loss": 0.0111, "reward": 1.0489583432674408, "reward_std": 0.1252418950200081, "rewards/accuracy_reward": 0.08125000242143869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083492279053, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 715.6854370117187, "epoch": 0.12481997119539126, "grad_norm": 0.06766840070486069, "kl": 0.10407169163227081, "learning_rate": 1.996299490911571e-05, "loss": 0.0009, "reward": 1.0505208671092987, "reward_std": 0.15630092974752188, "rewards/accuracy_reward": 0.0875000013038516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208551883698, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 739.745849609375, "epoch": 0.12514002240358457, "grad_norm": 0.06490278989076614, "kl": 0.22728676721453667, "learning_rate": 1.9962028109253474e-05, "loss": 0.0092, "reward": 0.9703125119209289, "reward_std": 0.1388203686103225, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958492279053, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 788.3354370117188, "epoch": 0.12546007361177788, "grad_norm": 0.07218482345342636, "kl": 0.12018184587359429, "learning_rate": 1.9961048866372016e-05, "loss": 0.0265, "reward": 1.0890625238418579, "reward_std": 0.1456417091190815, "rewards/accuracy_reward": 0.1250000024214387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9640625178813934, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 780.0896057128906, "epoch": 0.1257801248199712, "grad_norm": 0.09532662481069565, "kl": 0.10603573620319366, "learning_rate": 1.9960057181694464e-05, "loss": 0.0204, "reward": 1.0380208492279053, "reward_std": 0.10570584982633591, "rewards/accuracy_reward": 0.07083333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.967187511920929, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 766.1437683105469, "epoch": 0.1261001760281645, "grad_norm": 0.07415325194597244, "kl": 0.16074122115969658, "learning_rate": 1.9959053056459474e-05, "loss": 0.0278, "reward": 1.024479192495346, "reward_std": 0.13585025519132615, "rewards/accuracy_reward": 0.06458333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958492279053, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 751.6562805175781, "epoch": 0.12642022723635782, "grad_norm": 0.050613872706890106, "kl": 0.09890075251460076, "learning_rate": 1.995803649192124e-05, "loss": 0.0305, "reward": 1.012500023841858, "reward_std": 0.16258562356233597, "rewards/accuracy_reward": 0.05000000093132258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9625000119209289, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 792.3625183105469, "epoch": 0.12674027844455113, "grad_norm": 0.07034117728471756, "kl": 0.1736940063536167, "learning_rate": 1.9957007489349505e-05, "loss": 0.0082, "reward": 1.0932291924953461, "reward_std": 0.12479073386639357, "rewards/accuracy_reward": 0.11458333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 803.2937683105469, "epoch": 0.12706032965274444, "grad_norm": 0.1689036339521408, "kl": 0.11238553300499916, "learning_rate": 1.995596605002953e-05, "loss": 0.0313, "reward": 1.078125035762787, "reward_std": 0.14898527152836322, "rewards/accuracy_reward": 0.1104166692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083551883697, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 820.8812744140625, "epoch": 0.12738038086093775, "grad_norm": 0.1099768802523613, "kl": 0.09775342904031277, "learning_rate": 1.9954912175262122e-05, "loss": 0.0426, "reward": 1.015625011920929, "reward_std": 0.13907793909311295, "rewards/accuracy_reward": 0.054166667722165586, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583432674408, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 788.308349609375, "epoch": 0.12770043206913106, "grad_norm": 0.33350008726119995, "kl": 0.5557160004973412, "learning_rate": 1.995384586636362e-05, "loss": -0.0012, "reward": 1.067187523841858, "reward_std": 0.12045919597148895, "rewards/accuracy_reward": 0.10208333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041805744172, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 744.6021057128906, "epoch": 0.12802048327732438, "grad_norm": 0.14938224852085114, "kl": 0.1822477553039789, "learning_rate": 1.9952767124665892e-05, "loss": 0.0283, "reward": 1.0151041924953461, "reward_std": 0.14971655271947384, "rewards/accuracy_reward": 0.06250000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9526041865348815, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 792.327099609375, "epoch": 0.1283405344855177, "grad_norm": 0.063927561044693, "kl": 0.1587657429277897, "learning_rate": 1.995167595151633e-05, "loss": 0.0413, "reward": 1.0302083671092988, "reward_std": 0.21063872575759887, "rewards/accuracy_reward": 0.08750000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9427083611488343, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 761.5937683105469, "epoch": 0.128660585693711, "grad_norm": 0.17094162106513977, "kl": 0.2179608315229416, "learning_rate": 1.9950572348277862e-05, "loss": 0.0222, "reward": 1.0036458551883698, "reward_std": 0.17909386456012727, "rewards/accuracy_reward": 0.07083333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.932812511920929, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 768.9104309082031, "epoch": 0.1289806369019043, "grad_norm": 0.09395050257444382, "kl": 0.18149636760354043, "learning_rate": 1.9949456316328942e-05, "loss": 0.0304, "reward": 1.0598958492279054, "reward_std": 0.17585733085870742, "rewards/accuracy_reward": 0.12083333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9390625178813934, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 814.5562622070313, "epoch": 0.12930068811009762, "grad_norm": 0.14399601519107819, "kl": 0.2368574135005474, "learning_rate": 1.9948327857063536e-05, "loss": 0.0752, "reward": 0.9562500298023224, "reward_std": 0.22825298383831977, "rewards/accuracy_reward": 0.04375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9125000238418579, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 792.5541870117188, "epoch": 0.12962073931829093, "grad_norm": 0.14934572577476501, "kl": 0.2532314211130142, "learning_rate": 1.9947186971891143e-05, "loss": 0.0565, "reward": 0.9265625298023223, "reward_std": 0.25942430049180987, "rewards/accuracy_reward": 0.03333333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8932291865348816, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 788.277099609375, "epoch": 0.12994079052648425, "grad_norm": 0.07283183187246323, "kl": 0.11642909124493599, "learning_rate": 1.9946033662236778e-05, "loss": 0.0257, "reward": 1.043750023841858, "reward_std": 0.19481766372919082, "rewards/accuracy_reward": 0.11250000447034836, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.931250023841858, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 776.9812683105469, "epoch": 0.13026084173467756, "grad_norm": 0.23960764706134796, "kl": 0.1925484672188759, "learning_rate": 1.994486792954098e-05, "loss": 0.0636, "reward": 0.9286458611488342, "reward_std": 0.20316977724432944, "rewards/accuracy_reward": 0.01041666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9182291865348816, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 784.9062622070312, "epoch": 0.13058089294287087, "grad_norm": 0.3904155194759369, "kl": 0.26361445263028144, "learning_rate": 1.99436897752598e-05, "loss": 0.0458, "reward": 0.986979192495346, "reward_std": 0.23148050159215927, "rewards/accuracy_reward": 0.07916666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9078125178813934, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 727.0875244140625, "epoch": 0.13090094415106418, "grad_norm": 0.1020391508936882, "kl": 0.31486469730734823, "learning_rate": 1.9942499200864805e-05, "loss": 0.0486, "reward": 1.0520833492279054, "reward_std": 0.2606917768716812, "rewards/accuracy_reward": 0.14791667014360427, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9041666865348816, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 751.4396057128906, "epoch": 0.1312209953592575, "grad_norm": 0.11322091519832611, "kl": 0.16842807456851006, "learning_rate": 1.994129620784307e-05, "loss": 0.0705, "reward": 1.0145833551883698, "reward_std": 0.18669217731803656, "rewards/accuracy_reward": 0.08333333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9312500178813934, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 718.8125183105469, "epoch": 0.1315410465674508, "grad_norm": 0.14536863565444946, "kl": 0.3029049329459667, "learning_rate": 1.9940080797697203e-05, "loss": 0.0313, "reward": 1.0515625178813934, "reward_std": 0.2398408681154251, "rewards/accuracy_reward": 0.13958334028720856, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9119791865348816, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 765.3958557128906, "epoch": 0.13186109777564411, "grad_norm": 0.20916740596294403, "kl": 0.19605226889252664, "learning_rate": 1.993885297194529e-05, "loss": 0.0686, "reward": 0.9598958611488342, "reward_std": 0.18760251104831696, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9348958611488343, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 730.7062805175781, "epoch": 0.13218114898383743, "grad_norm": 0.38499322533607483, "kl": 0.2664633825421333, "learning_rate": 1.9937612732120947e-05, "loss": 0.0846, "reward": 1.0156250238418578, "reward_std": 0.2381811335682869, "rewards/accuracy_reward": 0.10833333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9072916805744171, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 704.5041931152343, "epoch": 0.13250120019203074, "grad_norm": 0.2399313747882843, "kl": 0.23138594403862953, "learning_rate": 1.9936360079773287e-05, "loss": 0.0258, "reward": 1.0177083551883697, "reward_std": 0.13459027968347073, "rewards/accuracy_reward": 0.06875000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9489583611488343, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 679.6562805175781, "epoch": 0.13282125140022405, "grad_norm": 0.12286972999572754, "kl": 0.19738261252641678, "learning_rate": 1.993509501646693e-05, "loss": 0.0594, "reward": 0.9828125357627868, "reward_std": 0.17468988746404648, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9369791865348815, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 700.7916870117188, "epoch": 0.13314130260841733, "grad_norm": 0.09988022595643997, "kl": 0.1340769723057747, "learning_rate": 1.9933817543781998e-05, "loss": 0.0263, "reward": 1.0812500238418579, "reward_std": 0.10942931771278382, "rewards/accuracy_reward": 0.11041667014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 683.0896118164062, "epoch": 0.13346135381661065, "grad_norm": 0.1567991077899933, "kl": 0.16551008746027945, "learning_rate": 1.9932527663314113e-05, "loss": 0.0718, "reward": 1.1494791865348817, "reward_std": 0.19006953090429307, "rewards/accuracy_reward": 0.1979166731238365, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9515625178813935, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 681.19169921875, "epoch": 0.13378140502480396, "grad_norm": 0.09303200244903564, "kl": 0.1856502816081047, "learning_rate": 1.9931225376674388e-05, "loss": 0.0536, "reward": 0.9911458492279053, "reward_std": 0.13494500890374184, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291805744171, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 687.7937683105469, "epoch": 0.13410145623299727, "grad_norm": 0.1906178593635559, "kl": 0.13947633281350136, "learning_rate": 1.992991068548944e-05, "loss": 0.0264, "reward": 1.035937523841858, "reward_std": 0.13257458936423064, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375238418579, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 660.9916870117188, "epoch": 0.13442150744119058, "grad_norm": 0.07890919595956802, "kl": 0.14936097264289855, "learning_rate": 1.9928583591401376e-05, "loss": 0.061, "reward": 1.021875023841858, "reward_std": 0.12687795273959637, "rewards/accuracy_reward": 0.05208333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916746139527, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 636.2854431152343, "epoch": 0.1347415586493839, "grad_norm": 0.1735936552286148, "kl": 0.1333605393767357, "learning_rate": 1.99272440960678e-05, "loss": 0.049, "reward": 1.0536458551883698, "reward_std": 0.13581256624311208, "rewards/accuracy_reward": 0.07291666902601719, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291805744172, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 653.3291931152344, "epoch": 0.1350616098575772, "grad_norm": 0.07206610590219498, "kl": 0.12243280112743378, "learning_rate": 1.9925892201161794e-05, "loss": 0.0189, "reward": 1.0270833551883698, "reward_std": 0.12921709027141332, "rewards/accuracy_reward": 0.05416666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166805744172, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 599.7333435058594, "epoch": 0.13538166106577051, "grad_norm": 0.08999348431825638, "kl": 0.15753435716032982, "learning_rate": 1.9924527908371942e-05, "loss": 0.0224, "reward": 1.1036458671092988, "reward_std": 0.08559925891458989, "rewards/accuracy_reward": 0.12500000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 695.9896118164063, "epoch": 0.13570171227396383, "grad_norm": 0.14725755155086517, "kl": 0.19224329441785812, "learning_rate": 1.9923151219402308e-05, "loss": 0.0054, "reward": 1.025000023841858, "reward_std": 0.18301806673407556, "rewards/accuracy_reward": 0.07083333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9541666865348816, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 676.3000183105469, "epoch": 0.13602176348215714, "grad_norm": 0.15762047469615936, "kl": 0.16184305176138877, "learning_rate": 1.9921762135972433e-05, "loss": 0.0042, "reward": 1.0750000417232513, "reward_std": 0.11091457530856133, "rewards/accuracy_reward": 0.10416667070239782, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333611488342, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 672.7541900634766, "epoch": 0.13634181469035045, "grad_norm": 0.18144048750400543, "kl": 0.1525034002959728, "learning_rate": 1.9920360659817345e-05, "loss": 0.0096, "reward": 1.0182291865348816, "reward_std": 0.11338572651147842, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 671.8291870117188, "epoch": 0.13666186589854376, "grad_norm": 0.5098335146903992, "kl": 0.28342188000679014, "learning_rate": 1.9918946792687553e-05, "loss": -0.0123, "reward": 1.0364583671092986, "reward_std": 0.1355344034731388, "rewards/accuracy_reward": 0.07500000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583551883698, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 660.2937744140625, "epoch": 0.13698191710673707, "grad_norm": 0.6447596549987793, "kl": 0.19238312169909477, "learning_rate": 1.9917520536349043e-05, "loss": -0.0135, "reward": 1.0156250298023224, "reward_std": 0.1723696757107973, "rewards/accuracy_reward": 0.06458333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9510416865348816, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 674.4146057128906, "epoch": 0.13730196831493038, "grad_norm": 0.3913459777832031, "kl": 0.6216646380722523, "learning_rate": 1.9916081892583264e-05, "loss": -0.0312, "reward": 1.0171875298023223, "reward_std": 0.14305626451969147, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375178813935, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 651.2750183105469, "epoch": 0.1376220195231237, "grad_norm": 2.6290104389190674, "kl": 2.6219520051032306, "learning_rate": 1.9914630863187156e-05, "loss": 0.0012, "reward": 1.2223958551883698, "reward_std": 0.13749304972589016, "rewards/accuracy_reward": 0.2604166746139526, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791865348816, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 639.3666870117188, "epoch": 0.137942070731317, "grad_norm": 0.130653515458107, "kl": 0.22933876588940622, "learning_rate": 1.991316744997311e-05, "loss": -0.0182, "reward": 1.1442708611488341, "reward_std": 0.2026175085455179, "rewards/accuracy_reward": 0.18541667368263007, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9588541924953461, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 644.3041931152344, "epoch": 0.13826212193951032, "grad_norm": 0.14742442965507507, "kl": 0.15334233343601228, "learning_rate": 1.9911691654769004e-05, "loss": -0.017, "reward": 1.0322916865348817, "reward_std": 0.11324199475347996, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583551883698, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 635.5687744140625, "epoch": 0.13858217314770363, "grad_norm": 0.10946609079837799, "kl": 0.2557044789195061, "learning_rate": 1.991020347941817e-05, "loss": -0.0395, "reward": 1.055729192495346, "reward_std": 0.17008791603147982, "rewards/accuracy_reward": 0.08958333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458492279053, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 641.2375244140625, "epoch": 0.13890222435589694, "grad_norm": 0.37317514419555664, "kl": 0.4989757601171732, "learning_rate": 1.99087029257794e-05, "loss": -0.073, "reward": 1.0625000238418578, "reward_std": 0.1615608898922801, "rewards/accuracy_reward": 0.11875000596046448, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9437500238418579, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 670.2771118164062, "epoch": 0.13922227556409025, "grad_norm": 0.4209003448486328, "kl": 0.7271633610129357, "learning_rate": 1.990718999572696e-05, "loss": -0.0251, "reward": 1.0343750298023224, "reward_std": 0.15719158351421356, "rewards/accuracy_reward": 0.08333333693444729, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9510416805744171, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 663.7791931152344, "epoch": 0.13954232677228356, "grad_norm": 0.3114742934703827, "kl": 0.837194798886776, "learning_rate": 1.9905664691150567e-05, "loss": -0.0674, "reward": 1.0057291865348816, "reward_std": 0.20192696750164033, "rewards/accuracy_reward": 0.06041666902601719, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9453125178813935, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 706.2041870117188, "epoch": 0.13986237798047688, "grad_norm": 0.32630032300949097, "kl": 0.714079699665308, "learning_rate": 1.9904127013955385e-05, "loss": -0.0474, "reward": 0.9906250298023224, "reward_std": 0.1500589970499277, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9531250238418579, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 729.1125244140625, "epoch": 0.1401824291886702, "grad_norm": 0.21482321619987488, "kl": 1.2284217976033687, "learning_rate": 1.990257696606205e-05, "loss": -0.0535, "reward": 0.970312523841858, "reward_std": 0.17871998697519303, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9453125178813935, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 685.1937683105468, "epoch": 0.1405024803968635, "grad_norm": 0.15007822215557098, "kl": 0.6202835611999035, "learning_rate": 1.9901014549406647e-05, "loss": -0.047, "reward": 1.0390625238418578, "reward_std": 0.1587829865515232, "rewards/accuracy_reward": 0.08333333488553762, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.955729192495346, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 674.8812683105468, "epoch": 0.1408225316050568, "grad_norm": 0.0724816843867302, "kl": 0.7360513672232628, "learning_rate": 1.9899439765940687e-05, "loss": -0.0453, "reward": 1.043229192495346, "reward_std": 0.2185194693505764, "rewards/accuracy_reward": 0.11666667200624943, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.926562511920929, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 664.8854309082031, "epoch": 0.14114258281325012, "grad_norm": 0.1124361976981163, "kl": 0.5966664545238018, "learning_rate": 1.989785261763116e-05, "loss": -0.0513, "reward": 1.0666667103767395, "reward_std": 0.17953022867441176, "rewards/accuracy_reward": 0.11041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9562500238418579, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 738.7437622070313, "epoch": 0.14146263402144343, "grad_norm": 0.07581885904073715, "kl": 0.2596233807504177, "learning_rate": 1.9896253106460484e-05, "loss": -0.0163, "reward": 1.0000000238418578, "reward_std": 0.1336134120821953, "rewards/accuracy_reward": 0.047916668094694616, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9520833432674408, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 709.8104370117187, "epoch": 0.14178268522963675, "grad_norm": 0.10569097846746445, "kl": 0.3374425023794174, "learning_rate": 1.9894641234426512e-05, "loss": -0.0491, "reward": 1.0265625357627868, "reward_std": 0.18511903360486032, "rewards/accuracy_reward": 0.07083333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291865348816, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 713.5937683105469, "epoch": 0.14210273643783006, "grad_norm": 0.06325946003198624, "kl": 0.23610839396715164, "learning_rate": 1.989301700354255e-05, "loss": -0.0304, "reward": 1.028125023841858, "reward_std": 0.13945788703858852, "rewards/accuracy_reward": 0.058333336375653744, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.969791692495346, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 761.3291870117188, "epoch": 0.14242278764602337, "grad_norm": 0.06835056841373444, "kl": 0.48913782387971877, "learning_rate": 1.9891380415837333e-05, "loss": -0.049, "reward": 1.0885416984558105, "reward_std": 0.13168836012482643, "rewards/accuracy_reward": 0.12083333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083432674408, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 700.1416809082032, "epoch": 0.14274283885421668, "grad_norm": 0.20968970656394958, "kl": 1.0379090007394551, "learning_rate": 1.9889731473355037e-05, "loss": -0.0439, "reward": 1.088541680574417, "reward_std": 0.17453248277306557, "rewards/accuracy_reward": 0.12916667181998492, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9593750178813935, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 709.220849609375, "epoch": 0.14306289006241, "grad_norm": 0.10043805092573166, "kl": 0.9384193673729897, "learning_rate": 1.9888070178155255e-05, "loss": -0.0607, "reward": 0.9505208551883697, "reward_std": 0.1598832830786705, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9505208551883697, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 738.3750183105469, "epoch": 0.1433829412706033, "grad_norm": 0.0979234129190445, "kl": 0.8160348013043404, "learning_rate": 1.9886396532313033e-05, "loss": -0.0677, "reward": 1.085937511920929, "reward_std": 0.169689111225307, "rewards/accuracy_reward": 0.13333333432674407, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9526041805744171, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 747.2396057128906, "epoch": 0.14370299247879662, "grad_norm": 0.14574353396892548, "kl": 0.419120055437088, "learning_rate": 1.9884710537918817e-05, "loss": -0.0409, "reward": 1.0651041865348816, "reward_std": 0.1468802396208048, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375119209289, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 696.0854400634765, "epoch": 0.14402304368698993, "grad_norm": 0.16010096669197083, "kl": 0.9446754395961762, "learning_rate": 1.9883012197078497e-05, "loss": -0.0705, "reward": 1.1078125298023225, "reward_std": 0.14860073514282704, "rewards/accuracy_reward": 0.1479166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958432674408, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 707.6729370117188, "epoch": 0.14434309489518324, "grad_norm": 0.1689026951789856, "kl": 0.75221516340971, "learning_rate": 1.9881301511913372e-05, "loss": -0.0726, "reward": 1.0015625298023223, "reward_std": 0.20324954241514206, "rewards/accuracy_reward": 0.06041666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9411458492279052, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 748.652099609375, "epoch": 0.14466314610337655, "grad_norm": 0.3587219715118408, "kl": 0.660324102267623, "learning_rate": 1.987957848456017e-05, "loss": -0.0702, "reward": 1.0984375417232513, "reward_std": 0.17780651152133942, "rewards/accuracy_reward": 0.14375000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9546875178813934, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 756.3708557128906, "epoch": 0.14498319731156986, "grad_norm": 0.12359297275543213, "kl": 0.466735539957881, "learning_rate": 1.9877843117171025e-05, "loss": -0.0601, "reward": 0.9791666865348816, "reward_std": 0.14083536192774773, "rewards/accuracy_reward": 0.018750001117587088, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9604166805744171, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 719.5229370117188, "epoch": 0.14530324851976317, "grad_norm": 0.16601644456386566, "kl": 0.2682775568217039, "learning_rate": 1.9876095411913492e-05, "loss": -0.0461, "reward": 1.0067708611488342, "reward_std": 0.14443401992321014, "rewards/accuracy_reward": 0.04791666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9588541865348816, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 730.7833557128906, "epoch": 0.14562329972795648, "grad_norm": 0.08124036341905594, "kl": 0.20361214652657508, "learning_rate": 1.9874335370970527e-05, "loss": -0.0202, "reward": 1.025000011920929, "reward_std": 0.10506038665771485, "rewards/accuracy_reward": 0.04791666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833432674408, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 742.1937774658203, "epoch": 0.1459433509361498, "grad_norm": 0.15889286994934082, "kl": 0.24789012856781484, "learning_rate": 1.9872562996540506e-05, "loss": -0.0089, "reward": 0.957812511920929, "reward_std": 0.13532231301069259, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291805744171, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 673.864599609375, "epoch": 0.1462634021443431, "grad_norm": 0.10304388403892517, "kl": 0.5019782140851021, "learning_rate": 1.9870778290837198e-05, "loss": -0.0221, "reward": 1.0114583551883698, "reward_std": 0.18528176210820674, "rewards/accuracy_reward": 0.05208333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.959375011920929, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 751.6729248046875, "epoch": 0.14658345335253642, "grad_norm": 0.46115314960479736, "kl": 0.4596158303320408, "learning_rate": 1.986898125608979e-05, "loss": -0.0049, "reward": 0.9968750059604645, "reward_std": 0.159324312210083, "rewards/accuracy_reward": 0.045833333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9510416746139526, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 728.0625244140625, "epoch": 0.1469035045607297, "grad_norm": 0.1962408721446991, "kl": 1.2458348341286183, "learning_rate": 1.9867171894542848e-05, "loss": -0.0097, "reward": 1.0625000238418578, "reward_std": 0.20020099878311157, "rewards/accuracy_reward": 0.12708333786576986, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9354166746139526, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 699.1791931152344, "epoch": 0.14722355576892301, "grad_norm": 0.26008763909339905, "kl": 1.0917382821440698, "learning_rate": 1.9865350208456354e-05, "loss": -0.0603, "reward": 0.9786458551883698, "reward_std": 0.21206720396876336, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9098958551883698, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 641.1541870117187, "epoch": 0.14754360697711633, "grad_norm": 0.2097797393798828, "kl": 0.6077730596065521, "learning_rate": 1.986351620010567e-05, "loss": -0.0465, "reward": 0.9921875238418579, "reward_std": 0.2253425493836403, "rewards/accuracy_reward": 0.0916666692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9005208373069763, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 664.3604400634765, "epoch": 0.14786365818530964, "grad_norm": 0.16634413599967957, "kl": 0.8130259275436401, "learning_rate": 1.9861669871781558e-05, "loss": -0.056, "reward": 0.9109375178813934, "reward_std": 0.2595849081873894, "rewards/accuracy_reward": 0.03750000167638064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.873437511920929, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 656.364599609375, "epoch": 0.14818370939350295, "grad_norm": 0.20586910843849182, "kl": 0.7062367737293244, "learning_rate": 1.9859811225790164e-05, "loss": -0.0838, "reward": 0.9447916805744171, "reward_std": 0.25804681032896043, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8760416805744171, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 686.0896057128906, "epoch": 0.14850376060169626, "grad_norm": 0.1890844851732254, "kl": 0.8060359954833984, "learning_rate": 1.9857940264453015e-05, "loss": -0.0408, "reward": 1.0729166984558105, "reward_std": 0.27789904475212096, "rewards/accuracy_reward": 0.18958333879709244, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8833333432674408, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 699.0854431152344, "epoch": 0.14882381180988957, "grad_norm": 0.1812220811843872, "kl": 0.8772768050432205, "learning_rate": 1.9856056990107035e-05, "loss": -0.0376, "reward": 0.9104166924953461, "reward_std": 0.2736320853233337, "rewards/accuracy_reward": 0.025000000558793544, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.885416692495346, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 600.462515258789, "epoch": 0.14914386301808288, "grad_norm": 0.4718879461288452, "kl": 0.8429543949663639, "learning_rate": 1.9854161405104512e-05, "loss": -0.0093, "reward": 1.0270833551883698, "reward_std": 0.2788348212838173, "rewards/accuracy_reward": 0.12083333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9062500119209289, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 671.1646057128906, "epoch": 0.1494639142262762, "grad_norm": 0.12379126250743866, "kl": 0.4919497549533844, "learning_rate": 1.9852253511813117e-05, "loss": -0.026, "reward": 0.9906250178813935, "reward_std": 0.17988998740911483, "rewards/accuracy_reward": 0.037500000186264516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9531250178813935, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 699.8041931152344, "epoch": 0.1497839654344695, "grad_norm": 0.0915331318974495, "kl": 0.4208625890314579, "learning_rate": 1.9850333312615895e-05, "loss": 0.0, "reward": 0.9916666865348815, "reward_std": 0.13873637914657594, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9520833432674408, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 650.2416931152344, "epoch": 0.15010401664266282, "grad_norm": 0.06342560797929764, "kl": 0.2490247033536434, "learning_rate": 1.9848400809911255e-05, "loss": -0.0083, "reward": 1.0442708551883697, "reward_std": 0.13400398399680852, "rewards/accuracy_reward": 0.07291667014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541746139527, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 678.0625183105469, "epoch": 0.15042406785085613, "grad_norm": 0.08342447876930237, "kl": 0.3054353781044483, "learning_rate": 1.9846456006112993e-05, "loss": 0.0171, "reward": 1.0041666865348815, "reward_std": 0.11574141420423985, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666746139526, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 665.9395935058594, "epoch": 0.15074411905904944, "grad_norm": 0.05020037665963173, "kl": 0.21988069340586663, "learning_rate": 1.9844498903650246e-05, "loss": -0.0038, "reward": 1.044791692495346, "reward_std": 0.1145276602357626, "rewards/accuracy_reward": 0.07291666921228171, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750178813934, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 649.4000183105469, "epoch": 0.15106417026724275, "grad_norm": 0.07973285764455795, "kl": 0.13310433998703958, "learning_rate": 1.9842529504967522e-05, "loss": -0.0014, "reward": 1.0661458551883698, "reward_std": 0.08744059428572655, "rewards/accuracy_reward": 0.08125000223517417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958492279053, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 631.7521057128906, "epoch": 0.15138422147543606, "grad_norm": 0.07526786625385284, "kl": 0.1988142393529415, "learning_rate": 1.9840547812524692e-05, "loss": 0.0236, "reward": 1.0739583671092987, "reward_std": 0.10084521546959876, "rewards/accuracy_reward": 0.0958333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250178813934, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 642.5104370117188, "epoch": 0.15170427268362938, "grad_norm": 0.10696788132190704, "kl": 0.20733307376503946, "learning_rate": 1.9838553828796977e-05, "loss": 0.0667, "reward": 1.1098958671092987, "reward_std": 0.1374841509386897, "rewards/accuracy_reward": 0.13750000540167093, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958432674408, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 691.658349609375, "epoch": 0.1520243238918227, "grad_norm": 0.08135160803794861, "kl": 0.13316810727119446, "learning_rate": 1.9836547556274954e-05, "loss": 0.0229, "reward": 1.1130208492279052, "reward_std": 0.07824347745627165, "rewards/accuracy_reward": 0.1270833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375059604645, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 714.3812805175781, "epoch": 0.152344375100016, "grad_norm": 0.04559599235653877, "kl": 0.12910185605287552, "learning_rate": 1.9834528997464543e-05, "loss": 0.0007, "reward": 0.9895833492279053, "reward_std": 0.05833333432674408, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987500011920929, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 711.7854309082031, "epoch": 0.1526644263082093, "grad_norm": 0.04492470622062683, "kl": 0.42608394622802737, "learning_rate": 1.983249815488702e-05, "loss": 0.0006, "reward": 1.1776041865348816, "reward_std": 0.08468389138579369, "rewards/accuracy_reward": 0.1895833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9880208432674408, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 693.3146179199218, "epoch": 0.15298447751640262, "grad_norm": 0.11721441149711609, "kl": 0.16851173639297484, "learning_rate": 1.9830455031078994e-05, "loss": 0.0135, "reward": 1.0994791984558105, "reward_std": 0.09610393829643726, "rewards/accuracy_reward": 0.11875000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291865348816, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 718.1646057128906, "epoch": 0.15330452872459593, "grad_norm": 0.06621968746185303, "kl": 0.16904096342623234, "learning_rate": 1.9828399628592415e-05, "loss": 0.019, "reward": 1.0234375178813935, "reward_std": 0.08933095633983612, "rewards/accuracy_reward": 0.041666668839752674, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708373069763, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 676.7208557128906, "epoch": 0.15362457993278925, "grad_norm": 0.04756326973438263, "kl": 0.1703629747033119, "learning_rate": 1.982633194999458e-05, "loss": 0.0217, "reward": 1.0296875178813933, "reward_std": 0.12128421142697335, "rewards/accuracy_reward": 0.04583333414047956, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541805744171, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 705.4875244140625, "epoch": 0.15394463114098256, "grad_norm": 0.05060265213251114, "kl": 0.14812782481312753, "learning_rate": 1.982425199786811e-05, "loss": 0.0175, "reward": 0.9734375178813934, "reward_std": 0.0796633617952466, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375178813934, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 673.8708526611329, "epoch": 0.15426468234917587, "grad_norm": 0.05663428083062172, "kl": 0.11479860544204712, "learning_rate": 1.982215977481096e-05, "loss": 0.0233, "reward": 1.0630208492279052, "reward_std": 0.1249284602701664, "rewards/accuracy_reward": 0.08541666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041686534882, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 654.2604431152344, "epoch": 0.15458473355736918, "grad_norm": 0.04875878244638443, "kl": 0.15140649005770684, "learning_rate": 1.9820055283436405e-05, "loss": 0.0177, "reward": 1.0848958671092988, "reward_std": 0.13879981879144906, "rewards/accuracy_reward": 0.10833333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625178813935, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 684.2354248046875, "epoch": 0.1549047847655625, "grad_norm": 0.19684112071990967, "kl": 0.12192679718136787, "learning_rate": 1.981793852637305e-05, "loss": 0.012, "reward": 1.0083333671092987, "reward_std": 0.13445462454110385, "rewards/accuracy_reward": 0.033333333767950536, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000178813935, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 677.3979370117188, "epoch": 0.1552248359737558, "grad_norm": 0.0429503507912159, "kl": 0.07962028235197068, "learning_rate": 1.9815809506264822e-05, "loss": 0.0169, "reward": 1.1307291984558105, "reward_std": 0.10726220346987247, "rewards/accuracy_reward": 0.15000000800937413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291805744172, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 700.1291931152343, "epoch": 0.15554488718194912, "grad_norm": 0.1666691154241562, "kl": 0.3849435657262802, "learning_rate": 1.9813668225770963e-05, "loss": 0.0189, "reward": 0.9781250059604645, "reward_std": 0.14107858017086983, "rewards/accuracy_reward": 0.018750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.959375011920929, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 645.3687744140625, "epoch": 0.15586493839014243, "grad_norm": 0.06179690733551979, "kl": 0.11919120997190476, "learning_rate": 1.981151468756603e-05, "loss": 0.0271, "reward": 1.0656250357627868, "reward_std": 0.07652283795177936, "rewards/accuracy_reward": 0.07916666902601718, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583492279053, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 695.7000183105469, "epoch": 0.15618498959833574, "grad_norm": 0.10747893154621124, "kl": 0.18021718934178352, "learning_rate": 1.9809348894339878e-05, "loss": 0.0037, "reward": 1.0776042103767396, "reward_std": 0.13260272592306138, "rewards/accuracy_reward": 0.10208333786576987, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 677.889599609375, "epoch": 0.15650504080652905, "grad_norm": 0.05964363366365433, "kl": 0.2115800127387047, "learning_rate": 1.9807170848797693e-05, "loss": 0.0441, "reward": 1.0671875417232513, "reward_std": 0.14080357179045677, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875178813935, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 729.8416870117187, "epoch": 0.15682509201472236, "grad_norm": 0.05163416638970375, "kl": 0.11210698634386063, "learning_rate": 1.980498055365994e-05, "loss": 0.0233, "reward": 1.0614583611488342, "reward_std": 0.11770116221159696, "rewards/accuracy_reward": 0.08750000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583551883697, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 724.8958618164063, "epoch": 0.15714514322291567, "grad_norm": 0.06177474930882454, "kl": 0.19536799862980841, "learning_rate": 1.9802778011662406e-05, "loss": 0.0382, "reward": 1.037500023841858, "reward_std": 0.1432920940220356, "rewards/accuracy_reward": 0.06875000111758708, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500178813935, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 728.8104431152344, "epoch": 0.15746519443110898, "grad_norm": 0.07783558964729309, "kl": 0.4894620396196842, "learning_rate": 1.980056322555616e-05, "loss": 0.0228, "reward": 0.9973958551883697, "reward_std": 0.16165089309215547, "rewards/accuracy_reward": 0.043750002048909664, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9536458611488342, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 691.7229370117187, "epoch": 0.1577852456393023, "grad_norm": 0.04840158671140671, "kl": 0.1310286693274975, "learning_rate": 1.9798336198107567e-05, "loss": 0.0466, "reward": 1.143229216337204, "reward_std": 0.16915795914828777, "rewards/accuracy_reward": 0.1791666742414236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9640625178813934, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 710.3750183105469, "epoch": 0.1581052968474956, "grad_norm": 0.07563085108995438, "kl": 0.16674732267856598, "learning_rate": 1.979609693209829e-05, "loss": 0.0548, "reward": 1.014062523841858, "reward_std": 0.19497016742825507, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9494791805744172, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 643.0791870117188, "epoch": 0.15842534805568892, "grad_norm": 0.07467242330312729, "kl": 0.22235333174467087, "learning_rate": 1.9793845430325263e-05, "loss": 0.0294, "reward": 1.0197916865348815, "reward_std": 0.16965479105710984, "rewards/accuracy_reward": 0.04791666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750178813934, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 677.2750244140625, "epoch": 0.15874539926388223, "grad_norm": 0.06882388889789581, "kl": 0.171281161904335, "learning_rate": 1.9791581695600722e-05, "loss": 0.0324, "reward": 1.0578125298023224, "reward_std": 0.15982217490673065, "rewards/accuracy_reward": 0.08750000353902579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125119209289, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 697.9937683105469, "epoch": 0.15906545047207554, "grad_norm": 0.11645076423883438, "kl": 0.20202036798000336, "learning_rate": 1.9789305730752167e-05, "loss": 0.0403, "reward": 1.0416666924953462, "reward_std": 0.13704411685466766, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833432674408, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 668.1021057128906, "epoch": 0.15938550168026885, "grad_norm": 0.06207313388586044, "kl": 0.2349511541426182, "learning_rate": 1.978701753862238e-05, "loss": 0.0339, "reward": 1.0593750298023223, "reward_std": 0.1413394134491682, "rewards/accuracy_reward": 0.0916666692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083551883697, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 714.4166870117188, "epoch": 0.15970555288846217, "grad_norm": 0.058839015662670135, "kl": 0.28067911267280576, "learning_rate": 1.9784717122069425e-05, "loss": 0.025, "reward": 1.0302083551883698, "reward_std": 0.17109771873801946, "rewards/accuracy_reward": 0.06458333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965625011920929, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 648.9312744140625, "epoch": 0.16002560409665548, "grad_norm": 0.1544368416070938, "kl": 0.24343355521559715, "learning_rate": 1.978240448396661e-05, "loss": 0.042, "reward": 1.090625023841858, "reward_std": 0.16572601571679116, "rewards/accuracy_reward": 0.12708333656191825, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9635416865348816, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 688.120849609375, "epoch": 0.1603456553048488, "grad_norm": 0.0498308427631855, "kl": 0.10392797477543354, "learning_rate": 1.9780079627202534e-05, "loss": 0.0175, "reward": 1.0885416924953462, "reward_std": 0.09727377630770206, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 658.6937622070312, "epoch": 0.1606657065130421, "grad_norm": 0.1539488285779953, "kl": 0.19753100723028183, "learning_rate": 1.9777742554681044e-05, "loss": 0.0408, "reward": 1.0635416984558106, "reward_std": 0.15354348700493575, "rewards/accuracy_reward": 0.09166666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.971875011920929, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 661.1750183105469, "epoch": 0.16098575772123538, "grad_norm": 0.09290697425603867, "kl": 0.33730019479990003, "learning_rate": 1.9775393269321252e-05, "loss": 0.0218, "reward": 1.115104192495346, "reward_std": 0.12255375757813454, "rewards/accuracy_reward": 0.13750000447034835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 671.9041809082031, "epoch": 0.1613058089294287, "grad_norm": 0.2346062958240509, "kl": 0.2508824057877064, "learning_rate": 1.9773031774057515e-05, "loss": 0.0345, "reward": 1.0421875298023224, "reward_std": 0.2036103442311287, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208492279053, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 625.0000213623047, "epoch": 0.161625860137622, "grad_norm": 0.11217735707759857, "kl": 0.1789027236402035, "learning_rate": 1.9770658071839448e-05, "loss": 0.0414, "reward": 1.0046875119209289, "reward_std": 0.1331900667399168, "rewards/accuracy_reward": 0.02916666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208373069764, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 656.3062683105469, "epoch": 0.16194591134581532, "grad_norm": 0.062012333422899246, "kl": 0.26196385324001314, "learning_rate": 1.976827216563191e-05, "loss": 0.0301, "reward": 1.0640625178813934, "reward_std": 0.12917129397392274, "rewards/accuracy_reward": 0.08750000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625059604645, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 683.8312622070313, "epoch": 0.16226596255400863, "grad_norm": 0.06038212776184082, "kl": 0.14489686340093613, "learning_rate": 1.9765874058415013e-05, "loss": 0.0513, "reward": 1.046875011920929, "reward_std": 0.10234173312783242, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583432674408, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 635.5646118164062, "epoch": 0.16258601376220194, "grad_norm": 0.08867309987545013, "kl": 0.30611826479434967, "learning_rate": 1.9763463753184092e-05, "loss": 0.0301, "reward": 1.055729192495346, "reward_std": 0.11597478222101927, "rewards/accuracy_reward": 0.08958333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458432674408, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 662.7104339599609, "epoch": 0.16290606497039525, "grad_norm": 0.09610988944768906, "kl": 0.25172789543867113, "learning_rate": 1.9761041252949725e-05, "loss": 0.0491, "reward": 1.045312511920929, "reward_std": 0.13255398720502853, "rewards/accuracy_reward": 0.08333333432674409, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791746139527, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 671.8375183105469, "epoch": 0.16322611617858857, "grad_norm": 0.07011920213699341, "kl": 0.2210270531475544, "learning_rate": 1.975860656073773e-05, "loss": 0.0615, "reward": 1.0250000178813934, "reward_std": 0.1721810780465603, "rewards/accuracy_reward": 0.06666666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333432674408, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 674.189599609375, "epoch": 0.16354616738678188, "grad_norm": 0.08514299988746643, "kl": 0.3713012598454952, "learning_rate": 1.9756159679589143e-05, "loss": 0.0475, "reward": 0.9869791805744171, "reward_std": 0.1320252813398838, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9536458432674408, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 674.545849609375, "epoch": 0.1638662185949752, "grad_norm": 0.14546971023082733, "kl": 0.31686792969703675, "learning_rate": 1.9753700612560228e-05, "loss": 0.0723, "reward": 1.0312500298023224, "reward_std": 0.19421770237386227, "rewards/accuracy_reward": 0.08750000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.943750011920929, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 647.0000183105469, "epoch": 0.1641862698031685, "grad_norm": 0.11060936003923416, "kl": 0.23016732260584832, "learning_rate": 1.9751229362722467e-05, "loss": 0.0717, "reward": 1.0494791865348816, "reward_std": 0.12477188017219305, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291805744171, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 647.9833557128907, "epoch": 0.1645063210113618, "grad_norm": 0.1363779753446579, "kl": 0.27407467886805537, "learning_rate": 1.974874593316257e-05, "loss": 0.0754, "reward": 1.0812500238418579, "reward_std": 0.17592179030179977, "rewards/accuracy_reward": 0.13333333730697633, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9479166746139527, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 663.7750244140625, "epoch": 0.16482637221955512, "grad_norm": 0.23831957578659058, "kl": 0.25771130472421644, "learning_rate": 1.9746250326982444e-05, "loss": 0.0599, "reward": 1.0421875298023224, "reward_std": 0.16807909309864044, "rewards/accuracy_reward": 0.09375000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9484375178813934, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 642.4479370117188, "epoch": 0.16514642342774843, "grad_norm": 0.1442326009273529, "kl": 0.2700896874070168, "learning_rate": 1.9743742547299213e-05, "loss": 0.0615, "reward": 1.0427083492279052, "reward_std": 0.15232707187533379, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.959375011920929, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 650.9854370117188, "epoch": 0.16546647463594175, "grad_norm": 0.6016630530357361, "kl": 0.3060616210103035, "learning_rate": 1.974122259724521e-05, "loss": 0.0772, "reward": 1.001562523841858, "reward_std": 0.17864162977784873, "rewards/accuracy_reward": 0.04375000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9578125059604645, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 667.845849609375, "epoch": 0.16578652584413506, "grad_norm": 0.15949472784996033, "kl": 0.39566795006394384, "learning_rate": 1.9738690479967964e-05, "loss": 0.0775, "reward": 1.064062523841858, "reward_std": 0.160395810008049, "rewards/accuracy_reward": 0.11250000353902578, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.951562511920929, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 701.4521057128907, "epoch": 0.16610657705232837, "grad_norm": 0.32095056772232056, "kl": 0.6456972368061542, "learning_rate": 1.9736146198630207e-05, "loss": 0.0871, "reward": 0.9614583492279053, "reward_std": 0.23185053169727327, "rewards/accuracy_reward": 0.05208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.909375011920929, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 693.208349609375, "epoch": 0.16642662826052168, "grad_norm": 0.18055865168571472, "kl": 0.4101051360368729, "learning_rate": 1.973358975640985e-05, "loss": 0.0719, "reward": 0.979687511920929, "reward_std": 0.2121795818209648, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9296875119209289, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 684.1354309082031, "epoch": 0.166746679468715, "grad_norm": 0.23663505911827087, "kl": 0.4086972147226334, "learning_rate": 1.9731021156500015e-05, "loss": 0.125, "reward": 0.9437500238418579, "reward_std": 0.2564606711268425, "rewards/accuracy_reward": 0.031250000558793546, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9125000178813935, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 684.6520935058594, "epoch": 0.1670667306769083, "grad_norm": 0.294890433549881, "kl": 0.3842778980731964, "learning_rate": 1.972844040210899e-05, "loss": 0.1195, "reward": 0.8927083373069763, "reward_std": 0.26684831380844115, "rewards/accuracy_reward": 0.01041666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8822916746139526, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 721.1958557128906, "epoch": 0.16738678188510162, "grad_norm": 0.36491721868515015, "kl": 0.6396187901496887, "learning_rate": 1.9725847496460256e-05, "loss": 0.1289, "reward": 0.8583333551883697, "reward_std": 0.29743548184633256, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8187500178813935, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 710.1604370117187, "epoch": 0.16770683309329493, "grad_norm": 0.4135096073150635, "kl": 0.605698075890541, "learning_rate": 1.9723242442792473e-05, "loss": 0.1082, "reward": 0.8958333551883697, "reward_std": 0.3122894302010536, "rewards/accuracy_reward": 0.07083333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8250000178813934, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 688.0000183105469, "epoch": 0.16802688430148824, "grad_norm": 0.39389896392822266, "kl": 0.43594875633716584, "learning_rate": 1.972062524435946e-05, "loss": 0.1242, "reward": 0.9848958611488342, "reward_std": 0.28320216238498686, "rewards/accuracy_reward": 0.11250000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8723958492279053, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 669.5000244140625, "epoch": 0.16834693550968155, "grad_norm": 0.27188587188720703, "kl": 0.47881949692964554, "learning_rate": 1.9717995904430224e-05, "loss": 0.1263, "reward": 0.9119791865348816, "reward_std": 0.22102243602275848, "rewards/accuracy_reward": 0.00625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9057291865348815, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 636.5646057128906, "epoch": 0.16866698671787486, "grad_norm": 0.19776694476604462, "kl": 0.42077905088663103, "learning_rate": 1.9715354426288923e-05, "loss": 0.1345, "reward": 0.993229192495346, "reward_std": 0.22253528684377671, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9140625178813935, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 652.664599609375, "epoch": 0.16898703792606817, "grad_norm": 0.2069554179906845, "kl": 0.3527437448501587, "learning_rate": 1.971270081323488e-05, "loss": 0.0745, "reward": 1.058854204416275, "reward_std": 0.2211466073989868, "rewards/accuracy_reward": 0.11458333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9442708432674408, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 666.9166870117188, "epoch": 0.16930708913426148, "grad_norm": 0.14599734544754028, "kl": 0.2297988161444664, "learning_rate": 1.9710035068582586e-05, "loss": 0.0446, "reward": 1.0453125357627868, "reward_std": 0.18706899434328078, "rewards/accuracy_reward": 0.07708333600312471, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291805744171, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 712.2750244140625, "epoch": 0.1696271403424548, "grad_norm": 0.1098317950963974, "kl": 0.262774883210659, "learning_rate": 1.9707357195661663e-05, "loss": 0.0559, "reward": 1.0963541865348816, "reward_std": 0.1620855674147606, "rewards/accuracy_reward": 0.13125000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041805744172, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 697.2083557128906, "epoch": 0.1699471915506481, "grad_norm": 0.11071398854255676, "kl": 0.1833828866481781, "learning_rate": 1.9704667197816906e-05, "loss": 0.0336, "reward": 1.0375000357627868, "reward_std": 0.15114332139492034, "rewards/accuracy_reward": 0.06041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833432674408, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 655.6458465576172, "epoch": 0.17026724275884142, "grad_norm": 0.04358408972620964, "kl": 0.14082614332437515, "learning_rate": 1.970196507840823e-05, "loss": 0.0266, "reward": 1.1223958730697632, "reward_std": 0.11901317611336708, "rewards/accuracy_reward": 0.1395833373069763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9828125178813935, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 667.9833618164063, "epoch": 0.17058729396703473, "grad_norm": 0.07485391199588776, "kl": 0.16539901047945021, "learning_rate": 1.9699250840810714e-05, "loss": 0.0206, "reward": 0.9927083492279053, "reward_std": 0.09257282391190529, "rewards/accuracy_reward": 0.014583333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250119209289, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 657.2625183105469, "epoch": 0.17090734517522804, "grad_norm": 0.07787440717220306, "kl": 0.14856073185801505, "learning_rate": 1.969652448841456e-05, "loss": 0.0205, "reward": 1.1171875238418578, "reward_std": 0.09560411293059587, "rewards/accuracy_reward": 0.1312500059604645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375178813934, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 682.5000305175781, "epoch": 0.17122739638342135, "grad_norm": 0.05641574785113335, "kl": 0.16644731312990188, "learning_rate": 1.9693786024625097e-05, "loss": 0.0178, "reward": 1.0593750178813934, "reward_std": 0.10782541166990996, "rewards/accuracy_reward": 0.0770833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916805744171, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 659.7396057128906, "epoch": 0.17154744759161467, "grad_norm": 0.1265953779220581, "kl": 0.16372175514698029, "learning_rate": 1.9691035452862798e-05, "loss": 0.028, "reward": 1.0588541984558106, "reward_std": 0.13097876124083996, "rewards/accuracy_reward": 0.0812500050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.977604192495346, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 698.2125183105469, "epoch": 0.17186749879980798, "grad_norm": 0.07919764518737793, "kl": 0.15723595917224883, "learning_rate": 1.9688272776563248e-05, "loss": 0.0262, "reward": 1.0427083611488341, "reward_std": 0.12501449398696424, "rewards/accuracy_reward": 0.05833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 662.214599609375, "epoch": 0.1721875500080013, "grad_norm": 0.038847651332616806, "kl": 0.128957362473011, "learning_rate": 1.968549799917715e-05, "loss": 0.0213, "reward": 1.0822916984558106, "reward_std": 0.11970577985048295, "rewards/accuracy_reward": 0.10000000465661288, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916805744171, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 653.0750244140625, "epoch": 0.1725076012161946, "grad_norm": 0.04241522029042244, "kl": 0.10394577831029891, "learning_rate": 1.9682711124170325e-05, "loss": 0.0061, "reward": 1.0479166805744171, "reward_std": 0.10726981312036514, "rewards/accuracy_reward": 0.060416667722165585, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9875000059604645, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 674.6229370117187, "epoch": 0.1728276524243879, "grad_norm": 0.03727027401328087, "kl": 0.11779571026563644, "learning_rate": 1.9679912155023713e-05, "loss": 0.0261, "reward": 1.0494791984558105, "reward_std": 0.1118181511759758, "rewards/accuracy_reward": 0.06250000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9869791746139527, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 710.9666870117187, "epoch": 0.17314770363258122, "grad_norm": 0.05394143611192703, "kl": 0.12853079214692115, "learning_rate": 1.9677101095233342e-05, "loss": 0.0488, "reward": 1.0333333671092988, "reward_std": 0.17711131498217583, "rewards/accuracy_reward": 0.06250000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 680.5291809082031, "epoch": 0.17346775484077454, "grad_norm": 0.05786094814538956, "kl": 0.12042107433080673, "learning_rate": 1.9674277948310355e-05, "loss": 0.0222, "reward": 1.0182291924953462, "reward_std": 0.10973568204790354, "rewards/accuracy_reward": 0.04166666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625059604645, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 713.570849609375, "epoch": 0.17378780604896785, "grad_norm": 0.09852173924446106, "kl": 0.108553709089756, "learning_rate": 1.9671442717780992e-05, "loss": 0.0338, "reward": 1.0500000357627868, "reward_std": 0.0853736650198698, "rewards/accuracy_reward": 0.06875000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9812500178813934, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 759.470849609375, "epoch": 0.17410785725716116, "grad_norm": 0.045692119747400284, "kl": 0.21100176870822906, "learning_rate": 1.966859540718658e-05, "loss": 0.0341, "reward": 1.0578125298023224, "reward_std": 0.12945474069565535, "rewards/accuracy_reward": 0.08541666902601719, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958432674408, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 707.0541870117188, "epoch": 0.17442790846535447, "grad_norm": 0.2436332404613495, "kl": 0.1932619445025921, "learning_rate": 1.9665736020083533e-05, "loss": 0.0672, "reward": 1.1473958611488342, "reward_std": 0.14116937890648842, "rewards/accuracy_reward": 0.18125000596046448, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458432674408, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 678.1062683105469, "epoch": 0.17474795967354778, "grad_norm": 0.08150272816419601, "kl": 0.1950148455798626, "learning_rate": 1.9662864560043364e-05, "loss": 0.066, "reward": 0.9979166865348816, "reward_std": 0.12375719584524632, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9604166865348815, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 715.6021118164062, "epoch": 0.17506801088174107, "grad_norm": 0.06785853207111359, "kl": 0.1598280780017376, "learning_rate": 1.9659981030652648e-05, "loss": 0.0494, "reward": 0.9994791865348815, "reward_std": 0.1482737548649311, "rewards/accuracy_reward": 0.02708333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958492279052, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 705.9791931152344, "epoch": 0.17538806208993438, "grad_norm": 0.0760614350438118, "kl": 0.15669554099440575, "learning_rate": 1.9657085435513043e-05, "loss": 0.0302, "reward": 0.987500011920929, "reward_std": 0.09059235211461783, "rewards/accuracy_reward": 0.008333333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666746139527, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 646.0375183105468, "epoch": 0.1757081132981277, "grad_norm": 0.055525023490190506, "kl": 0.15209799632430077, "learning_rate": 1.9654177778241278e-05, "loss": 0.0474, "reward": 1.1031250298023223, "reward_std": 0.16906238086521624, "rewards/accuracy_reward": 0.1333333384245634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916865348816, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 682.064599609375, "epoch": 0.176028164506321, "grad_norm": 0.11843759566545486, "kl": 0.14061254411935806, "learning_rate": 1.965125806246915e-05, "loss": 0.0696, "reward": 1.0270833551883698, "reward_std": 0.1306549172848463, "rewards/accuracy_reward": 0.054166667722165586, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166805744172, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 685.5396118164062, "epoch": 0.1763482157145143, "grad_norm": 0.0710177794098854, "kl": 0.1787277102470398, "learning_rate": 1.9648326291843505e-05, "loss": 0.0245, "reward": 1.0244791865348817, "reward_std": 0.08614383526146412, "rewards/accuracy_reward": 0.04375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291865348816, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 662.0729309082031, "epoch": 0.17666826692270762, "grad_norm": 0.18198657035827637, "kl": 0.2556367240846157, "learning_rate": 1.9645382470026267e-05, "loss": 0.0487, "reward": 1.0348958432674409, "reward_std": 0.16033005770295858, "rewards/accuracy_reward": 0.075, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958432674408, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 681.495849609375, "epoch": 0.17698831813090093, "grad_norm": 0.04028153792023659, "kl": 0.10996652320027352, "learning_rate": 1.9642426600694395e-05, "loss": 0.0267, "reward": 1.0442708551883697, "reward_std": 0.13642770163714885, "rewards/accuracy_reward": 0.07083333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973437511920929, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 731.7521057128906, "epoch": 0.17730836933909425, "grad_norm": 0.06031052768230438, "kl": 0.13037733137607574, "learning_rate": 1.9639458687539905e-05, "loss": 0.0409, "reward": 1.0692708611488342, "reward_std": 0.1615061044692993, "rewards/accuracy_reward": 0.11250000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9567708492279052, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 666.8333557128906, "epoch": 0.17762842054728756, "grad_norm": 0.08267563581466675, "kl": 0.16332112476229668, "learning_rate": 1.9636478734269854e-05, "loss": 0.0564, "reward": 0.9770833432674408, "reward_std": 0.11380629241466522, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 627.0062683105468, "epoch": 0.17794847175548087, "grad_norm": 0.13827405869960785, "kl": 0.19004355743527412, "learning_rate": 1.963348674460633e-05, "loss": 0.0621, "reward": 1.0869791865348817, "reward_std": 0.15473262146115302, "rewards/accuracy_reward": 0.1208333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458432674408, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 709.895849609375, "epoch": 0.17826852296367418, "grad_norm": 0.08501515537500381, "kl": 0.14372501894831657, "learning_rate": 1.9630482722286473e-05, "loss": 0.0429, "reward": 1.0583333671092987, "reward_std": 0.13740524798631668, "rewards/accuracy_reward": 0.08541666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166746139526, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 679.3437683105469, "epoch": 0.1785885741718675, "grad_norm": 0.3827410936355591, "kl": 0.22375328987836837, "learning_rate": 1.9627466671062434e-05, "loss": 0.0238, "reward": 1.1010416984558105, "reward_std": 0.11596148405224085, "rewards/accuracy_reward": 0.12916667032986878, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750178813934, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 637.4833557128907, "epoch": 0.1789086253800608, "grad_norm": 0.07304264605045319, "kl": 0.41341082081198693, "learning_rate": 1.9624438594701397e-05, "loss": 0.042, "reward": 1.0317708432674408, "reward_std": 0.16881751529872419, "rewards/accuracy_reward": 0.06250000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708492279053, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 664.8666931152344, "epoch": 0.17922867658825412, "grad_norm": 0.07624661922454834, "kl": 0.18999108150601388, "learning_rate": 1.9621398496985566e-05, "loss": 0.0489, "reward": 1.0333333611488342, "reward_std": 0.12361449729651212, "rewards/accuracy_reward": 0.07083333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9625000119209289, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 669.8458557128906, "epoch": 0.17954872779644743, "grad_norm": 0.07759030163288116, "kl": 0.18845606744289398, "learning_rate": 1.9618346381712163e-05, "loss": 0.0296, "reward": 1.015625011920929, "reward_std": 0.11664673164486886, "rewards/accuracy_reward": 0.037500002235174176, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250119209289, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 685.6666809082031, "epoch": 0.17986877900464074, "grad_norm": 0.08038783073425293, "kl": 0.14941411837935448, "learning_rate": 1.9615282252693407e-05, "loss": 0.04, "reward": 1.0671875298023223, "reward_std": 0.1277464386075735, "rewards/accuracy_reward": 0.0937500026077032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973437511920929, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 654.0541870117188, "epoch": 0.18018883021283405, "grad_norm": 0.061528515070676804, "kl": 0.23907251805067062, "learning_rate": 1.9612206113756536e-05, "loss": 0.0627, "reward": 1.0000000238418578, "reward_std": 0.1499695971608162, "rewards/accuracy_reward": 0.039583335630595684, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9604166805744171, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 636.6979370117188, "epoch": 0.18050888142102736, "grad_norm": 0.11629898101091385, "kl": 0.22134559452533722, "learning_rate": 1.9609117968743794e-05, "loss": 0.0883, "reward": 1.0432291865348815, "reward_std": 0.15978550240397454, "rewards/accuracy_reward": 0.08333333544433116, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958492279053, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 688.502099609375, "epoch": 0.18082893262922067, "grad_norm": 0.25423702597618103, "kl": 0.21465194523334502, "learning_rate": 1.9606017821512405e-05, "loss": 0.067, "reward": 0.9765625059604645, "reward_std": 0.1502749115228653, "rewards/accuracy_reward": 0.01666666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958373069764, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 692.4937744140625, "epoch": 0.18114898383741398, "grad_norm": 0.16149629652500153, "kl": 0.3055619314312935, "learning_rate": 1.960290567593459e-05, "loss": 0.0905, "reward": 0.9885416865348816, "reward_std": 0.18179295882582663, "rewards/accuracy_reward": 0.04583333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9427083492279053, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 666.0479309082032, "epoch": 0.1814690350456073, "grad_norm": 0.25656628608703613, "kl": 0.4261516511440277, "learning_rate": 1.9599781535897562e-05, "loss": 0.1483, "reward": 0.9687500059604645, "reward_std": 0.21279908418655397, "rewards/accuracy_reward": 0.04791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9208333432674408, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 696.0646057128906, "epoch": 0.1817890862538006, "grad_norm": 0.25229644775390625, "kl": 0.43460706919431685, "learning_rate": 1.9596645405303508e-05, "loss": 0.1277, "reward": 0.994791692495346, "reward_std": 0.23564037531614304, "rewards/accuracy_reward": 0.06666666734963655, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.928125011920929, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 711.927099609375, "epoch": 0.18210913746199392, "grad_norm": 0.4171068072319031, "kl": 0.6513818740844727, "learning_rate": 1.9593497288069603e-05, "loss": 0.1526, "reward": 0.9005208492279053, "reward_std": 0.24727483838796616, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8963541805744171, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 632.3687713623046, "epoch": 0.18242918867018723, "grad_norm": 0.2483467310667038, "kl": 0.7798386961221695, "learning_rate": 1.9590337188127978e-05, "loss": 0.1642, "reward": 1.012500023841858, "reward_std": 0.27335314750671386, "rewards/accuracy_reward": 0.11666666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8958333432674408, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 681.7146057128906, "epoch": 0.18274923987838054, "grad_norm": 0.3533123731613159, "kl": 0.6377503961324692, "learning_rate": 1.9587165109425746e-05, "loss": 0.1724, "reward": 0.9296875298023224, "reward_std": 0.2813147783279419, "rewards/accuracy_reward": 0.05208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8776041865348816, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 652.0479431152344, "epoch": 0.18306929108657385, "grad_norm": 0.21204252541065216, "kl": 0.408619812130928, "learning_rate": 1.9583981055924966e-05, "loss": 0.1507, "reward": 0.9921875178813935, "reward_std": 0.24514594674110413, "rewards/accuracy_reward": 0.10625000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.885937511920929, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 710.6229370117187, "epoch": 0.18338934229476717, "grad_norm": 0.26197925209999084, "kl": 0.49179228246212003, "learning_rate": 1.9580785031602673e-05, "loss": 0.1293, "reward": 0.9187500178813934, "reward_std": 0.27653331905603407, "rewards/accuracy_reward": 0.058333336375653744, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8604166805744171, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 667.7625183105469, "epoch": 0.18370939350296048, "grad_norm": 0.3436623513698578, "kl": 0.43792471289634705, "learning_rate": 1.9577577040450842e-05, "loss": 0.1387, "reward": 0.9255208611488343, "reward_std": 0.22069956436753274, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9151041805744171, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 683.5166870117188, "epoch": 0.1840294447111538, "grad_norm": 0.38407763838768005, "kl": 0.5805969923734665, "learning_rate": 1.9574357086476398e-05, "loss": 0.1509, "reward": 0.9453125178813935, "reward_std": 0.2484004467725754, "rewards/accuracy_reward": 0.04791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8973958432674408, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 669.6416931152344, "epoch": 0.1843494959193471, "grad_norm": 0.3657831847667694, "kl": 1.5230638265609742, "learning_rate": 1.95711251737012e-05, "loss": 0.1762, "reward": 0.9635416865348816, "reward_std": 0.23155898600816727, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8968750178813935, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 706.039599609375, "epoch": 0.1846695471275404, "grad_norm": 0.2620721757411957, "kl": 0.5607627764344215, "learning_rate": 1.9567881306162065e-05, "loss": 0.1382, "reward": 1.0312500178813935, "reward_std": 0.24164563566446304, "rewards/accuracy_reward": 0.11458333544433116, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9166666746139527, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 691.7416931152344, "epoch": 0.18498959833573372, "grad_norm": 0.3571402132511139, "kl": 0.5915179625153542, "learning_rate": 1.956462548791072e-05, "loss": 0.1009, "reward": 0.9963541805744172, "reward_std": 0.24583946019411088, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9234375059604645, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 680.7125183105469, "epoch": 0.18530964954392704, "grad_norm": 0.26667729020118713, "kl": 0.4023525446653366, "learning_rate": 1.9561357723013827e-05, "loss": 0.1324, "reward": 1.0218750298023225, "reward_std": 0.21049386411905288, "rewards/accuracy_reward": 0.09583333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9260416805744172, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 695.9666931152344, "epoch": 0.18562970075212035, "grad_norm": 0.15235862135887146, "kl": 0.41516488790512085, "learning_rate": 1.9558078015552973e-05, "loss": 0.0822, "reward": 0.9942708492279053, "reward_std": 0.21924960762262344, "rewards/accuracy_reward": 0.05416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9401041746139527, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 656.2229370117187, "epoch": 0.18594975196031366, "grad_norm": 0.16361404955387115, "kl": 0.3572592079639435, "learning_rate": 1.9554786369624666e-05, "loss": 0.1157, "reward": 1.0755208492279054, "reward_std": 0.23814705833792688, "rewards/accuracy_reward": 0.13541667088866233, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9401041746139527, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 644.3979370117188, "epoch": 0.18626980316850697, "grad_norm": 0.2164674550294876, "kl": 0.3529484748840332, "learning_rate": 1.9551482789340308e-05, "loss": 0.0896, "reward": 1.0656250357627868, "reward_std": 0.1942263960838318, "rewards/accuracy_reward": 0.1104166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9552083492279053, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 688.3125244140625, "epoch": 0.18658985437670028, "grad_norm": 0.13018456101417542, "kl": 0.40574545711278914, "learning_rate": 1.9548167278826224e-05, "loss": 0.0848, "reward": 1.0317708492279052, "reward_std": 0.14726283103227616, "rewards/accuracy_reward": 0.07708333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9546875119209289, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 750.0979431152343, "epoch": 0.1869099055848936, "grad_norm": 0.1716415286064148, "kl": 0.30623201876878736, "learning_rate": 1.9544839842223636e-05, "loss": 0.0671, "reward": 1.0182291865348816, "reward_std": 0.17932818606495857, "rewards/accuracy_reward": 0.06250000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291805744171, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 698.2916870117188, "epoch": 0.1872299567930869, "grad_norm": 0.23720037937164307, "kl": 0.29293742030858994, "learning_rate": 1.9541500483688663e-05, "loss": 0.1013, "reward": 1.0354166865348815, "reward_std": 0.19150078296661377, "rewards/accuracy_reward": 0.08958333488553763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9458333373069763, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 757.683349609375, "epoch": 0.18755000800128022, "grad_norm": 0.08887584507465363, "kl": 0.18934873640537261, "learning_rate": 1.9538149207392306e-05, "loss": 0.0544, "reward": 0.9760416746139526, "reward_std": 0.0908809632062912, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583373069763, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 721.4979309082031, "epoch": 0.18787005920947353, "grad_norm": 0.07203318923711777, "kl": 0.24687618166208267, "learning_rate": 1.9534786017520466e-05, "loss": 0.0771, "reward": 1.0692708551883698, "reward_std": 0.16076359152793884, "rewards/accuracy_reward": 0.11666667014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9526041746139526, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 739.3646057128906, "epoch": 0.18819011041766684, "grad_norm": 0.27686551213264465, "kl": 0.5672411054372788, "learning_rate": 1.9531410918273915e-05, "loss": 0.0731, "reward": 0.9807291924953461, "reward_std": 0.2045274019241333, "rewards/accuracy_reward": 0.06041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9203125059604644, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 728.4708618164062, "epoch": 0.18851016162586015, "grad_norm": 0.18232998251914978, "kl": 0.3686387039721012, "learning_rate": 1.9528023913868305e-05, "loss": 0.0819, "reward": 0.9927083611488342, "reward_std": 0.16963814198970795, "rewards/accuracy_reward": 0.03958333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9531250059604645, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 690.6479370117188, "epoch": 0.18883021283405343, "grad_norm": 0.10632241517305374, "kl": 0.3312704361975193, "learning_rate": 1.9524625008534153e-05, "loss": 0.0953, "reward": 1.021875011920929, "reward_std": 0.1697681626304984, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9531250119209289, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 703.708349609375, "epoch": 0.18915026404224675, "grad_norm": 0.05055807903409004, "kl": 0.2949476674199104, "learning_rate": 1.9521214206516845e-05, "loss": 0.0396, "reward": 0.9864583492279053, "reward_std": 0.09214165061712265, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250119209289, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 717.8375244140625, "epoch": 0.18947031525044006, "grad_norm": 0.16359588503837585, "kl": 0.19101431891322135, "learning_rate": 1.9517791512076628e-05, "loss": 0.0395, "reward": 1.0151041984558105, "reward_std": 0.14598013013601302, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9588541805744171, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 688.8396057128906, "epoch": 0.18979036645863337, "grad_norm": 0.09676505625247955, "kl": 0.188329254090786, "learning_rate": 1.95143569294886e-05, "loss": 0.0821, "reward": 0.9786458432674408, "reward_std": 0.1273002317175269, "rewards/accuracy_reward": 0.01458333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.964062511920929, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 679.6104309082032, "epoch": 0.19011041766682668, "grad_norm": 0.08294422924518585, "kl": 0.1577234983444214, "learning_rate": 1.9510910463042704e-05, "loss": 0.0623, "reward": 1.005729180574417, "reward_std": 0.13024714030325413, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291805744171, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 738.9958618164062, "epoch": 0.19043046887502, "grad_norm": 0.07775887846946716, "kl": 0.13679290562868118, "learning_rate": 1.9507452117043736e-05, "loss": 0.0505, "reward": 1.0354166805744172, "reward_std": 0.1238449014723301, "rewards/accuracy_reward": 0.06041666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000059604644, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 657.9041870117187, "epoch": 0.1907505200832133, "grad_norm": 0.1872749775648117, "kl": 0.17498825788497924, "learning_rate": 1.950398189581132e-05, "loss": 0.0693, "reward": 1.1302083551883697, "reward_std": 0.11385347358882428, "rewards/accuracy_reward": 0.16458333730697633, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9656250059604645, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 733.158349609375, "epoch": 0.19107057129140662, "grad_norm": 0.07586333155632019, "kl": 0.17170817404985428, "learning_rate": 1.9500499803679925e-05, "loss": 0.0651, "reward": 0.9713541805744171, "reward_std": 0.10218179430812598, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.967187511920929, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 692.6416809082032, "epoch": 0.19139062249959993, "grad_norm": 0.08267262578010559, "kl": 0.2973045527935028, "learning_rate": 1.9497005844998835e-05, "loss": 0.065, "reward": 1.0682291984558105, "reward_std": 0.17779658660292624, "rewards/accuracy_reward": 0.12083333693444728, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9473958432674408, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 728.98544921875, "epoch": 0.19171067370779324, "grad_norm": 0.04687151312828064, "kl": 0.19852605685591698, "learning_rate": 1.949350002413216e-05, "loss": 0.0557, "reward": 0.9947916805744171, "reward_std": 0.11155761461704969, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916805744172, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 656.8187683105468, "epoch": 0.19203072491598655, "grad_norm": 0.1460895538330078, "kl": 0.262048863619566, "learning_rate": 1.9489982345458832e-05, "loss": 0.121, "reward": 1.0598958492279054, "reward_std": 0.12374843284487724, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958432674408, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 675.1812683105469, "epoch": 0.19235077612417986, "grad_norm": 0.17061711847782135, "kl": 0.31913367435336115, "learning_rate": 1.9486452813372586e-05, "loss": 0.1273, "reward": 1.0255208551883697, "reward_std": 0.15627394691109658, "rewards/accuracy_reward": 0.08541666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9401041746139527, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 699.1958557128906, "epoch": 0.19267082733237317, "grad_norm": 0.12607340514659882, "kl": 0.23554740473628044, "learning_rate": 1.9482911432281963e-05, "loss": 0.0923, "reward": 1.0270833611488341, "reward_std": 0.13728910144418477, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9625000119209289, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 717.2666809082032, "epoch": 0.19299087854056649, "grad_norm": 0.17443907260894775, "kl": 0.2461421586573124, "learning_rate": 1.947935820661031e-05, "loss": 0.066, "reward": 1.0833333492279054, "reward_std": 0.15556199103593826, "rewards/accuracy_reward": 0.12291667107492685, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9604166805744171, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 674.7812744140625, "epoch": 0.1933109297487598, "grad_norm": 0.13457360863685608, "kl": 0.29405288547277453, "learning_rate": 1.947579314079577e-05, "loss": 0.0852, "reward": 0.9744791865348816, "reward_std": 0.16201976984739302, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9348958492279053, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 656.6604309082031, "epoch": 0.1936309809569531, "grad_norm": 0.13644298911094666, "kl": 0.29756073504686353, "learning_rate": 1.9472216239291256e-05, "loss": 0.1072, "reward": 0.9921875178813935, "reward_std": 0.15694627091288565, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9484375059604645, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 707.0354309082031, "epoch": 0.19395103216514642, "grad_norm": 0.09973659366369247, "kl": 0.2315961815416813, "learning_rate": 1.946862750656449e-05, "loss": 0.0682, "reward": 1.0463541805744172, "reward_std": 0.17754550725221635, "rewards/accuracy_reward": 0.10000000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9463541746139527, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 702.8583557128907, "epoch": 0.19427108337333973, "grad_norm": 0.11070112884044647, "kl": 0.3192076399922371, "learning_rate": 1.946502694709796e-05, "loss": 0.1039, "reward": 1.0281250178813934, "reward_std": 0.19837626814842224, "rewards/accuracy_reward": 0.09583333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9322916805744171, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 648.0583557128906, "epoch": 0.19459113458153304, "grad_norm": 0.1061927005648613, "kl": 0.30815952718257905, "learning_rate": 1.9461414565388917e-05, "loss": 0.1082, "reward": 1.0973958492279052, "reward_std": 0.21912664771080018, "rewards/accuracy_reward": 0.15416667331010103, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9432291746139526, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 687.2354370117188, "epoch": 0.19491118578972635, "grad_norm": 0.17305564880371094, "kl": 0.31731778383255005, "learning_rate": 1.9457790365949395e-05, "loss": 0.0975, "reward": 1.020312535762787, "reward_std": 0.18112142533063888, "rewards/accuracy_reward": 0.0854166692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9348958492279053, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 657.0562622070313, "epoch": 0.19523123699791967, "grad_norm": 0.17710889875888824, "kl": 0.3631772108376026, "learning_rate": 1.945415435330618e-05, "loss": 0.1042, "reward": 0.9520833492279053, "reward_std": 0.16206833571195603, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9458333551883698, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 722.6562683105469, "epoch": 0.19555128820611298, "grad_norm": 0.1479700654745102, "kl": 0.531545577943325, "learning_rate": 1.945050653200081e-05, "loss": 0.1065, "reward": 0.9593750238418579, "reward_std": 0.23627854734659196, "rewards/accuracy_reward": 0.043750001676380636, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9156250119209289, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 741.0146118164063, "epoch": 0.1958713394143063, "grad_norm": 0.3069762885570526, "kl": 0.7132264107465744, "learning_rate": 1.9446846906589586e-05, "loss": 0.1666, "reward": 0.9562500238418579, "reward_std": 0.2901140823960304, "rewards/accuracy_reward": 0.0708333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8854166805744171, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 687.283349609375, "epoch": 0.1961913906224996, "grad_norm": 0.303815633058548, "kl": 0.5620258882641792, "learning_rate": 1.9443175481643536e-05, "loss": 0.1316, "reward": 0.9885416805744172, "reward_std": 0.24952167868614197, "rewards/accuracy_reward": 0.09166666995733977, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.896875011920929, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 759.2500183105469, "epoch": 0.1965114418306929, "grad_norm": 0.10386354476213455, "kl": 0.34386143982410433, "learning_rate": 1.9439492261748438e-05, "loss": 0.0905, "reward": 1.0187500178813935, "reward_std": 0.19125093519687653, "rewards/accuracy_reward": 0.08333333544433116, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9354166805744171, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 689.5208557128906, "epoch": 0.19683149303888622, "grad_norm": 0.16504688560962677, "kl": 0.2635188832879066, "learning_rate": 1.9435797251504797e-05, "loss": 0.1012, "reward": 1.0119791805744172, "reward_std": 0.1481368623673916, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9453125178813935, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 706.8146118164062, "epoch": 0.19715154424707954, "grad_norm": 0.23499853909015656, "kl": 0.4958648651838303, "learning_rate": 1.9432090455527847e-05, "loss": 0.1062, "reward": 0.9473958432674408, "reward_std": 0.22377754971385003, "rewards/accuracy_reward": 0.03125000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9161458373069763, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 685.5250183105469, "epoch": 0.19747159545527285, "grad_norm": 0.1540578454732895, "kl": 0.22922171503305436, "learning_rate": 1.9428371878447545e-05, "loss": 0.0918, "reward": 1.160416716337204, "reward_std": 0.1973195567727089, "rewards/accuracy_reward": 0.21250000689178705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9479166805744171, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 794.7541931152343, "epoch": 0.19779164666346616, "grad_norm": 0.1357450783252716, "kl": 0.2704797863960266, "learning_rate": 1.9424641524908553e-05, "loss": 0.0411, "reward": 0.9552083492279053, "reward_std": 0.1474937668070197, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9510416805744171, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 722.7125183105469, "epoch": 0.19811169787165947, "grad_norm": 0.16005557775497437, "kl": 0.3636905699968338, "learning_rate": 1.942089939957026e-05, "loss": 0.0828, "reward": 0.9494791805744172, "reward_std": 0.2315253049135208, "rewards/accuracy_reward": 0.02500000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9244791865348816, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 678.9395935058594, "epoch": 0.19843174907985278, "grad_norm": 0.21744054555892944, "kl": 0.5238471448421478, "learning_rate": 1.9417145507106737e-05, "loss": 0.1072, "reward": 0.9651041865348816, "reward_std": 0.21560292392969133, "rewards/accuracy_reward": 0.05416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.910937511920929, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 685.8021118164063, "epoch": 0.1987518002880461, "grad_norm": 0.2656537890434265, "kl": 0.6612590730190278, "learning_rate": 1.9413379852206772e-05, "loss": 0.1127, "reward": 0.9630208432674408, "reward_std": 0.1951449528336525, "rewards/accuracy_reward": 0.04166666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9213541746139526, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 686.4250244140625, "epoch": 0.1990718514962394, "grad_norm": 0.17960472404956818, "kl": 0.41118341088294985, "learning_rate": 1.940960243957383e-05, "loss": 0.1284, "reward": 1.0088541805744171, "reward_std": 0.16900431141257286, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.942187511920929, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 668.5521057128906, "epoch": 0.19939190270443272, "grad_norm": 0.14084936678409576, "kl": 0.3053492411971092, "learning_rate": 1.9405813273926076e-05, "loss": 0.0959, "reward": 1.0520833551883697, "reward_std": 0.15083030611276627, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.950000011920929, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 638.0021057128906, "epoch": 0.19971195391262603, "grad_norm": 0.33902284502983093, "kl": 0.4690784841775894, "learning_rate": 1.9402012359996342e-05, "loss": 0.1339, "reward": 1.1656250417232514, "reward_std": 0.2067689336836338, "rewards/accuracy_reward": 0.2250000076368451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.940625011920929, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 687.9312683105469, "epoch": 0.20003200512081934, "grad_norm": 0.12033872306346893, "kl": 0.23370456770062448, "learning_rate": 1.9398199702532143e-05, "loss": 0.0821, "reward": 1.0390625298023224, "reward_std": 0.19397076815366746, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291746139527, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 687.9896057128906, "epoch": 0.20035205632901265, "grad_norm": 0.1661445051431656, "kl": 0.20272066816687584, "learning_rate": 1.9394375306295655e-05, "loss": 0.0791, "reward": 0.9723958492279052, "reward_std": 0.12908698618412018, "rewards/accuracy_reward": 0.008333333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.964062511920929, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 623.8812683105468, "epoch": 0.20067210753720596, "grad_norm": 0.1688031256198883, "kl": 0.2342596873641014, "learning_rate": 1.9390539176063723e-05, "loss": 0.0951, "reward": 1.1093750178813935, "reward_std": 0.11803668811917305, "rewards/accuracy_reward": 0.14166667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083373069764, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 624.2083557128906, "epoch": 0.20099215874539927, "grad_norm": 0.13567431271076202, "kl": 0.240685623139143, "learning_rate": 1.9386691316627845e-05, "loss": 0.0907, "reward": 1.084375023841858, "reward_std": 0.17157965078949927, "rewards/accuracy_reward": 0.11875000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9656250059604645, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 647.1916809082031, "epoch": 0.20131220995359259, "grad_norm": 0.11089053750038147, "kl": 0.3034446746110916, "learning_rate": 1.938283173279417e-05, "loss": 0.1056, "reward": 0.9802083611488343, "reward_std": 0.21385945081710817, "rewards/accuracy_reward": 0.03750000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9427083432674408, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 622.4396118164062, "epoch": 0.2016322611617859, "grad_norm": 0.13379743695259094, "kl": 0.3716781333088875, "learning_rate": 1.9378960429383494e-05, "loss": 0.1141, "reward": 0.9442708432674408, "reward_std": 0.14153240323066713, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.942187511920929, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 630.175015258789, "epoch": 0.2019523123699792, "grad_norm": 0.3276001811027527, "kl": 0.3941763326525688, "learning_rate": 1.937507741123124e-05, "loss": 0.0728, "reward": 1.0151041984558105, "reward_std": 0.17675597220659256, "rewards/accuracy_reward": 0.0625000026077032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9526041865348815, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 619.908349609375, "epoch": 0.20227236357817252, "grad_norm": 0.1801009476184845, "kl": 0.5507948979735374, "learning_rate": 1.9371182683187477e-05, "loss": 0.169, "reward": 1.021354192495346, "reward_std": 0.20031024273484946, "rewards/accuracy_reward": 0.09583333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9255208492279052, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 565.6271057128906, "epoch": 0.20259241478636583, "grad_norm": 0.25220805406570435, "kl": 0.4888093382120132, "learning_rate": 1.9367276250116894e-05, "loss": 0.1247, "reward": 1.0687500417232514, "reward_std": 0.16525894403457642, "rewards/accuracy_reward": 0.12291667014360427, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9458333492279053, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 636.6562652587891, "epoch": 0.20291246599455912, "grad_norm": 0.2679707705974579, "kl": 0.541820627450943, "learning_rate": 1.9363358116898804e-05, "loss": 0.1338, "reward": 1.0411458611488342, "reward_std": 0.19658421874046325, "rewards/accuracy_reward": 0.12083333842456341, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9203125238418579, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 650.052099609375, "epoch": 0.20323251720275243, "grad_norm": 0.2806706130504608, "kl": 0.48353932052850723, "learning_rate": 1.935942828842713e-05, "loss": 0.1038, "reward": 0.9880208551883698, "reward_std": 0.20751163512468337, "rewards/accuracy_reward": 0.05416666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9338541746139526, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 614.6312683105468, "epoch": 0.20355256841094574, "grad_norm": 0.14846235513687134, "kl": 0.5935159817337989, "learning_rate": 1.93554867696104e-05, "loss": 0.1618, "reward": 0.9677083611488342, "reward_std": 0.22315222583711147, "rewards/accuracy_reward": 0.04791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9197916865348816, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 615.1833526611329, "epoch": 0.20387261961913905, "grad_norm": 0.1585318148136139, "kl": 0.4165584176778793, "learning_rate": 1.9351533565371747e-05, "loss": 0.1312, "reward": 1.0536458611488342, "reward_std": 0.19778771847486495, "rewards/accuracy_reward": 0.12083333693444728, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.932812511920929, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 608.758349609375, "epoch": 0.20419267082733236, "grad_norm": 0.17503924667835236, "kl": 0.47458241432905196, "learning_rate": 1.9347568680648903e-05, "loss": 0.1487, "reward": 1.0119791805744172, "reward_std": 0.26864703595638273, "rewards/accuracy_reward": 0.08750000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9244791746139527, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 608.4541809082032, "epoch": 0.20451272203552567, "grad_norm": 0.2242783159017563, "kl": 0.33436805531382563, "learning_rate": 1.9343592120394187e-05, "loss": 0.1189, "reward": 0.9973958611488343, "reward_std": 0.18330222517251968, "rewards/accuracy_reward": 0.05416666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9432291924953461, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 607.3437683105469, "epoch": 0.20483277324371899, "grad_norm": 0.1964443325996399, "kl": 0.3340280294418335, "learning_rate": 1.9339603889574498e-05, "loss": 0.1283, "reward": 0.9588541865348816, "reward_std": 0.17630846053361893, "rewards/accuracy_reward": 0.020833334513008596, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9380208551883698, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 653.9416870117187, "epoch": 0.2051528244519123, "grad_norm": 0.15593037009239197, "kl": 0.4793477475643158, "learning_rate": 1.9335603993171318e-05, "loss": 0.1163, "reward": 0.9802083492279052, "reward_std": 0.15702517479658126, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.934375011920929, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 630.5083587646484, "epoch": 0.2054728756601056, "grad_norm": 0.1937408149242401, "kl": 0.5273372441530227, "learning_rate": 1.9331592436180698e-05, "loss": 0.1689, "reward": 0.9833333492279053, "reward_std": 0.22838717848062515, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9270833492279053, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 654.8875183105469, "epoch": 0.20579292686829892, "grad_norm": 0.22712171077728271, "kl": 0.6470546633005142, "learning_rate": 1.932756922361325e-05, "loss": 0.1601, "reward": 1.032291704416275, "reward_std": 0.2833305008709431, "rewards/accuracy_reward": 0.13333333898335695, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8989583551883698, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 645.6250122070312, "epoch": 0.20611297807649223, "grad_norm": 0.29243704676628113, "kl": 0.45784421265125275, "learning_rate": 1.932353436049414e-05, "loss": 0.1327, "reward": 0.986979192495346, "reward_std": 0.20242194682359696, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9307291805744171, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 671.0583557128906, "epoch": 0.20643302928468554, "grad_norm": 0.2416207194328308, "kl": 0.39548794478178023, "learning_rate": 1.9319487851863103e-05, "loss": 0.1223, "reward": 1.1203125298023224, "reward_std": 0.2333257243037224, "rewards/accuracy_reward": 0.1875000052154064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9328125059604645, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 624.8312683105469, "epoch": 0.20675308049287885, "grad_norm": 0.1228092610836029, "kl": 0.2716518625617027, "learning_rate": 1.9315429702774408e-05, "loss": 0.0677, "reward": 0.9864583551883698, "reward_std": 0.15608318373560906, "rewards/accuracy_reward": 0.029166667722165585, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916805744172, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 642.5541870117188, "epoch": 0.20707313170107217, "grad_norm": 0.17116791009902954, "kl": 0.26060923784971235, "learning_rate": 1.9311359918296855e-05, "loss": 0.0978, "reward": 1.0656250238418579, "reward_std": 0.14626556485891343, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965625011920929, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 697.2333557128907, "epoch": 0.20739318290926548, "grad_norm": 0.12016578763723373, "kl": 0.24654133021831512, "learning_rate": 1.9307278503513803e-05, "loss": 0.061, "reward": 1.037500023841858, "reward_std": 0.12056761756539344, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833492279052, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 620.2396057128906, "epoch": 0.2077132341174588, "grad_norm": 0.07504566013813019, "kl": 0.21631157025694847, "learning_rate": 1.9303185463523108e-05, "loss": 0.0896, "reward": 1.0578125178813935, "reward_std": 0.13687589354813098, "rewards/accuracy_reward": 0.0916666692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458432674408, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 648.6208435058594, "epoch": 0.2080332853256521, "grad_norm": 0.2659440040588379, "kl": 0.1398515522480011, "learning_rate": 1.929908080343717e-05, "loss": 0.0441, "reward": 1.048437511920929, "reward_std": 0.11139777526259423, "rewards/accuracy_reward": 0.06250000316649676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375059604645, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 669.0771118164063, "epoch": 0.2083533365338454, "grad_norm": 0.12325471639633179, "kl": 0.24901945143938065, "learning_rate": 1.9294964528382885e-05, "loss": 0.0615, "reward": 1.0468750238418578, "reward_std": 0.164197001978755, "rewards/accuracy_reward": 0.0791666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083492279053, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 645.5500122070313, "epoch": 0.20867338774203872, "grad_norm": 0.10141347348690033, "kl": 0.3649654157459736, "learning_rate": 1.929083664350167e-05, "loss": 0.1014, "reward": 1.0307291924953461, "reward_std": 0.14789256304502488, "rewards/accuracy_reward": 0.08125000242143869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9494791746139526, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 692.3437683105469, "epoch": 0.20899343895023204, "grad_norm": 0.09904764592647552, "kl": 0.2902476988732815, "learning_rate": 1.9286697153949436e-05, "loss": 0.0651, "reward": 1.0135416865348816, "reward_std": 0.14488410539925098, "rewards/accuracy_reward": 0.04791666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965625011920929, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 662.4937744140625, "epoch": 0.20931349015842535, "grad_norm": 0.10897406935691833, "kl": 0.20142997726798056, "learning_rate": 1.9282546064896594e-05, "loss": 0.0721, "reward": 1.078125011920929, "reward_std": 0.09604029338806867, "rewards/accuracy_reward": 0.10625000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.971875011920929, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 625.9479431152344, "epoch": 0.20963354136661866, "grad_norm": 0.23808997869491577, "kl": 0.22053091898560523, "learning_rate": 1.9278383381528036e-05, "loss": 0.0576, "reward": 1.0166666805744171, "reward_std": 0.12363926023244858, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833492279053, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 673.3208557128906, "epoch": 0.20995359257481197, "grad_norm": 0.07611843943595886, "kl": 0.2132401891052723, "learning_rate": 1.9274209109043146e-05, "loss": 0.0521, "reward": 1.0567708492279053, "reward_std": 0.1237191118299961, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041746139527, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 634.1875122070312, "epoch": 0.21027364378300528, "grad_norm": 0.09360788017511368, "kl": 0.23806697279214858, "learning_rate": 1.927002325265577e-05, "loss": 0.0523, "reward": 1.0703125298023224, "reward_std": 0.14319055881351234, "rewards/accuracy_reward": 0.09375000428408384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625178813935, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 661.1416870117188, "epoch": 0.2105936949911986, "grad_norm": 0.07128574699163437, "kl": 0.16915738582611084, "learning_rate": 1.9265825817594232e-05, "loss": 0.0558, "reward": 1.0114583551883698, "reward_std": 0.08447882384061814, "rewards/accuracy_reward": 0.02708333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 677.1687744140625, "epoch": 0.2109137461993919, "grad_norm": 0.06273775547742844, "kl": 0.1647022284567356, "learning_rate": 1.9261616809101317e-05, "loss": 0.0496, "reward": 1.0583333432674409, "reward_std": 0.1425313174724579, "rewards/accuracy_reward": 0.08333333544433116, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000178813935, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 688.7479370117187, "epoch": 0.21123379740758522, "grad_norm": 0.11021216958761215, "kl": 0.2603888504207134, "learning_rate": 1.9257396232434266e-05, "loss": 0.0737, "reward": 1.0020833432674408, "reward_std": 0.09916403293609619, "rewards/accuracy_reward": 0.02916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166805744172, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 658.9229431152344, "epoch": 0.21155384861577853, "grad_norm": 0.08434461057186127, "kl": 0.16674437001347542, "learning_rate": 1.9253164092864768e-05, "loss": 0.059, "reward": 1.0015625178813934, "reward_std": 0.1232110183686018, "rewards/accuracy_reward": 0.02916666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958432674408, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 655.5833557128906, "epoch": 0.21187389982397184, "grad_norm": 0.10858671367168427, "kl": 0.13052089065313338, "learning_rate": 1.9248920395678955e-05, "loss": 0.0463, "reward": 1.1036458551883697, "reward_std": 0.11935643032193184, "rewards/accuracy_reward": 0.12500000428408384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 656.4583435058594, "epoch": 0.21219395103216515, "grad_norm": 0.050972215831279755, "kl": 0.13374503329396248, "learning_rate": 1.9244665146177395e-05, "loss": -0.0086, "reward": 1.1062500059604645, "reward_std": 0.10863641854375601, "rewards/accuracy_reward": 0.11458333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916666746139526, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 646.0729309082031, "epoch": 0.21251400224035846, "grad_norm": 0.06517866253852844, "kl": 0.1450530506670475, "learning_rate": 1.9240398349675083e-05, "loss": 0.071, "reward": 1.0520833551883697, "reward_std": 0.09250790346413851, "rewards/accuracy_reward": 0.07708333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 623.0187744140625, "epoch": 0.21283405344855177, "grad_norm": 0.10822878032922745, "kl": 0.15510641485452653, "learning_rate": 1.9236120011501442e-05, "loss": 0.0592, "reward": 1.1192708611488342, "reward_std": 0.07371204420924186, "rewards/accuracy_reward": 0.1375000050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708373069763, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 662.3271057128907, "epoch": 0.21315410465674509, "grad_norm": 0.09872405230998993, "kl": 0.19749830849468708, "learning_rate": 1.9231830137000305e-05, "loss": 0.0727, "reward": 0.9979166805744171, "reward_std": 0.13651593923568725, "rewards/accuracy_reward": 0.03125000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666746139526, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 677.6062744140625, "epoch": 0.2134741558649384, "grad_norm": 0.08014009892940521, "kl": 0.16215406954288483, "learning_rate": 1.922752873152992e-05, "loss": 0.0448, "reward": 1.037500011920929, "reward_std": 0.12696239706128837, "rewards/accuracy_reward": 0.06666667070239782, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333492279053, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 658.7479309082031, "epoch": 0.2137942070731317, "grad_norm": 0.17184635996818542, "kl": 0.16198827996850013, "learning_rate": 1.9223215800462937e-05, "loss": 0.0821, "reward": 1.041666680574417, "reward_std": 0.12022981494665146, "rewards/accuracy_reward": 0.07291666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500119209289, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 691.1166870117188, "epoch": 0.21411425828132502, "grad_norm": 0.1011757031083107, "kl": 0.18935470506548882, "learning_rate": 1.9218891349186394e-05, "loss": 0.0567, "reward": 1.0567708730697631, "reward_std": 0.14946944694966077, "rewards/accuracy_reward": 0.08958333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875238418579, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 652.1541870117187, "epoch": 0.21443430948951833, "grad_norm": 0.12701667845249176, "kl": 0.2506525985896587, "learning_rate": 1.9214555383101724e-05, "loss": 0.0562, "reward": 1.0416666924953462, "reward_std": 0.1605004720389843, "rewards/accuracy_reward": 0.07500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666805744171, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 610.1229461669922, "epoch": 0.21475436069771164, "grad_norm": 0.10412585735321045, "kl": 0.26857480928301813, "learning_rate": 1.9210207907624748e-05, "loss": 0.1258, "reward": 1.0447917044162751, "reward_std": 0.1726130098104477, "rewards/accuracy_reward": 0.08958333618938923, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9552083551883698, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 668.1708618164063, "epoch": 0.21507441190590496, "grad_norm": 0.1114473044872284, "kl": 0.3208130903542042, "learning_rate": 1.920584892818566e-05, "loss": 0.116, "reward": 0.9984375238418579, "reward_std": 0.20095318108797072, "rewards/accuracy_reward": 0.04791666734963655, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9505208432674408, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 632.1812805175781, "epoch": 0.21539446311409827, "grad_norm": 0.33339497447013855, "kl": 0.355214512348175, "learning_rate": 1.9201478450229012e-05, "loss": 0.1135, "reward": 1.1171875238418578, "reward_std": 0.13930478543043137, "rewards/accuracy_reward": 0.1562500050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375059604645, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 669.3979370117188, "epoch": 0.21571451432229158, "grad_norm": 0.10623612254858017, "kl": 0.3115902006626129, "learning_rate": 1.919709647921373e-05, "loss": 0.1144, "reward": 0.9781250298023224, "reward_std": 0.1574092723429203, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916805744172, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 647.3854370117188, "epoch": 0.2160345655304849, "grad_norm": 0.16619139909744263, "kl": 0.23366658315062522, "learning_rate": 1.9192703020613094e-05, "loss": 0.0834, "reward": 1.0635416984558106, "reward_std": 0.11643952075392008, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583551883698, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 669.1562652587891, "epoch": 0.2163546167386782, "grad_norm": 0.3299196660518646, "kl": 0.3408443845808506, "learning_rate": 1.918829807991473e-05, "loss": 0.1221, "reward": 1.0786458611488343, "reward_std": 0.18198420107364655, "rewards/accuracy_reward": 0.13125000353902577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9473958432674408, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 639.083349609375, "epoch": 0.2166746679468715, "grad_norm": 0.8060458898544312, "kl": 0.29906757101416587, "learning_rate": 1.9183881662620606e-05, "loss": 0.0866, "reward": 1.0479166805744171, "reward_std": 0.1341792933642864, "rewards/accuracy_reward": 0.08750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9604166746139526, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 664.6291809082031, "epoch": 0.2169947191550648, "grad_norm": 2.5255606174468994, "kl": 0.7097936183214187, "learning_rate": 1.9179453774247023e-05, "loss": 0.1361, "reward": 0.9807291865348816, "reward_std": 0.15256869401782752, "rewards/accuracy_reward": 0.04166666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9390625178813934, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 675.3479309082031, "epoch": 0.2173147703632581, "grad_norm": 342.9512023925781, "kl": 60.940721249580385, "learning_rate": 1.9175014420324613e-05, "loss": 3.6971, "reward": 0.9463541865348816, "reward_std": 0.22579180002212523, "rewards/accuracy_reward": 0.03125000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9151041805744171, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 627.5437683105469, "epoch": 0.21763482157145142, "grad_norm": 26.168609619140625, "kl": 7.308992192149162, "learning_rate": 1.917056360639833e-05, "loss": 0.5832, "reward": 1.0463541865348815, "reward_std": 0.18034582138061522, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9463541746139527, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 634.9021057128906, "epoch": 0.21795487277964473, "grad_norm": 3.199669122695923, "kl": 0.572056169807911, "learning_rate": 1.9166101338027436e-05, "loss": 0.1711, "reward": 1.009375023841858, "reward_std": 0.18491822630167007, "rewards/accuracy_reward": 0.07708333544433117, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9302083492279053, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 636.4750183105468, "epoch": 0.21827492398783804, "grad_norm": 2.4151406288146973, "kl": 0.709225732088089, "learning_rate": 1.916162762078551e-05, "loss": 0.1667, "reward": 0.9307291805744171, "reward_std": 0.1792273811995983, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9265625059604645, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 654.2396057128906, "epoch": 0.21859497519603135, "grad_norm": 1.0843818187713623, "kl": 1.1302322834730147, "learning_rate": 1.915714246026042e-05, "loss": 0.1616, "reward": 1.0307291865348815, "reward_std": 0.17401037737727165, "rewards/accuracy_reward": 0.08541666995733976, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9453125178813935, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 617.0979309082031, "epoch": 0.21891502640422467, "grad_norm": 0.26156195998191833, "kl": 0.5273042991757393, "learning_rate": 1.915264586205433e-05, "loss": 0.1901, "reward": 0.9677083551883697, "reward_std": 0.193864406645298, "rewards/accuracy_reward": 0.04791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9197916805744171, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 649.1666931152344, "epoch": 0.21923507761241798, "grad_norm": 0.3921996057033539, "kl": 0.7704705983400345, "learning_rate": 1.91481378317837e-05, "loss": 0.2797, "reward": 0.9046875238418579, "reward_std": 0.34336878657341, "rewards/accuracy_reward": 0.07500000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8296875119209289, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 715.9125244140625, "epoch": 0.2195551288206113, "grad_norm": 0.4275372326374054, "kl": 0.7206552475690842, "learning_rate": 1.9143618375079257e-05, "loss": 0.2251, "reward": 0.9151041805744171, "reward_std": 0.33101013153791425, "rewards/accuracy_reward": 0.09791666977107524, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.817187511920929, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 706.3562744140625, "epoch": 0.2198751800288046, "grad_norm": 0.6302100419998169, "kl": 1.1008682191371917, "learning_rate": 1.9139087497586004e-05, "loss": 0.3284, "reward": 0.7807291865348815, "reward_std": 0.3698259711265564, "rewards/accuracy_reward": 0.0708333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7098958492279053, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 666.8437744140625, "epoch": 0.2201952312369979, "grad_norm": 0.3019793927669525, "kl": 0.6823631256818772, "learning_rate": 1.9134545204963214e-05, "loss": 0.2566, "reward": 0.8682291805744171, "reward_std": 0.32157149612903596, "rewards/accuracy_reward": 0.05000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8182291865348816, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 605.3000213623047, "epoch": 0.22051528244519122, "grad_norm": 0.252623587846756, "kl": 0.4231353387236595, "learning_rate": 1.912999150288441e-05, "loss": 0.236, "reward": 0.912500011920929, "reward_std": 0.24352166503667833, "rewards/accuracy_reward": 0.01666666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8958333373069763, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 625.7562805175781, "epoch": 0.22083533365338454, "grad_norm": 0.2588579058647156, "kl": 0.4063556343317032, "learning_rate": 1.912542639703737e-05, "loss": 0.1746, "reward": 0.9609375298023224, "reward_std": 0.25126550942659376, "rewards/accuracy_reward": 0.05416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9067708611488342, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 628.4041870117187, "epoch": 0.22115538486157785, "grad_norm": 0.35281902551651, "kl": 0.36444804519414903, "learning_rate": 1.912084989312412e-05, "loss": 0.1878, "reward": 0.9937500178813934, "reward_std": 0.21244567185640334, "rewards/accuracy_reward": 0.08125000242143869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9125000059604644, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 605.5083435058593, "epoch": 0.22147543606977116, "grad_norm": 1.7048529386520386, "kl": 0.3061882697045803, "learning_rate": 1.9116261996860914e-05, "loss": 0.1228, "reward": 0.9671875238418579, "reward_std": 0.17979936115443707, "rewards/accuracy_reward": 0.02083333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9463541805744171, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 575.952099609375, "epoch": 0.22179548727796447, "grad_norm": 1.6680268049240112, "kl": 0.5593406990170479, "learning_rate": 1.9111662713978242e-05, "loss": 0.0753, "reward": 1.0125000119209289, "reward_std": 0.06362286508083344, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666746139527, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 612.7687683105469, "epoch": 0.22211553848615778, "grad_norm": 43.46950912475586, "kl": 12.55296850502491, "learning_rate": 1.9107052050220808e-05, "loss": 0.6179, "reward": 1.1260417103767395, "reward_std": 0.13523164130747317, "rewards/accuracy_reward": 0.14791667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250178813934, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 573.4521026611328, "epoch": 0.2224355896943511, "grad_norm": 47.668182373046875, "kl": 10.92622417807579, "learning_rate": 1.910243001134755e-05, "loss": 0.739, "reward": 1.0005208551883698, "reward_std": 0.08894652742892503, "rewards/accuracy_reward": 0.02291666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 619.3916870117188, "epoch": 0.2227556409025444, "grad_norm": 0.6365566253662109, "kl": 0.6296287894248962, "learning_rate": 1.909779660313159e-05, "loss": 0.0362, "reward": 1.0015625178813934, "reward_std": 0.1047313479706645, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291746139526, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 631.7125183105469, "epoch": 0.22307569211073772, "grad_norm": 0.41112789511680603, "kl": 0.15625541731715203, "learning_rate": 1.9093151831360268e-05, "loss": 0.02, "reward": 1.0812500178813935, "reward_std": 0.11710264217108488, "rewards/accuracy_reward": 0.10208333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666746139527, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 627.0625183105469, "epoch": 0.22339574331893103, "grad_norm": 0.29101845622062683, "kl": 0.17532607764005662, "learning_rate": 1.9088495701835113e-05, "loss": 0.022, "reward": 1.0437500059604645, "reward_std": 0.07715525384992361, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833373069763, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 634.1250122070312, "epoch": 0.22371579452712434, "grad_norm": 0.20614954829216003, "kl": 0.15920972526073457, "learning_rate": 1.9083828220371835e-05, "loss": 0.039, "reward": 1.0114583551883698, "reward_std": 0.12535146437585354, "rewards/accuracy_reward": 0.04166666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916805744171, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 598.0562683105469, "epoch": 0.22403584573531765, "grad_norm": 0.13199330866336823, "kl": 0.1495030015707016, "learning_rate": 1.907914939280033e-05, "loss": 0.0005, "reward": 0.9937500298023224, "reward_std": 0.07663525212556124, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166805744171, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 638.8729309082031, "epoch": 0.22435589694351096, "grad_norm": 0.06893268972635269, "kl": 0.12032232657074929, "learning_rate": 1.907445922496466e-05, "loss": 0.0137, "reward": 0.9947916865348816, "reward_std": 0.04492462687194347, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990625011920929, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 628.9604309082031, "epoch": 0.22467594815170427, "grad_norm": 0.11694058030843735, "kl": 0.14955301880836486, "learning_rate": 1.906975772272306e-05, "loss": 0.0213, "reward": 1.0473958492279052, "reward_std": 0.10141881592571736, "rewards/accuracy_reward": 0.06875000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 664.6687744140625, "epoch": 0.22499599935989759, "grad_norm": 0.09218557178974152, "kl": 0.1656613454222679, "learning_rate": 1.906504489194791e-05, "loss": 0.0414, "reward": 1.0697916924953461, "reward_std": 0.11464045755565166, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083432674408, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 639.5875305175781, "epoch": 0.2253160505680909, "grad_norm": 0.15477770566940308, "kl": 0.163698972761631, "learning_rate": 1.9060320738525756e-05, "loss": 0.0578, "reward": 1.0651041865348816, "reward_std": 0.12406023722141982, "rewards/accuracy_reward": 0.09375000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541805744171, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 636.4208557128907, "epoch": 0.2256361017762842, "grad_norm": 0.06102179363369942, "kl": 0.14229050129652024, "learning_rate": 1.905558526835727e-05, "loss": 0.001, "reward": 0.9911458492279053, "reward_std": 0.06305509340018034, "rewards/accuracy_reward": 0.00625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958432674408, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 634.3583618164063, "epoch": 0.22595615298447752, "grad_norm": 0.08625641465187073, "kl": 0.12337017208337783, "learning_rate": 1.9050838487357267e-05, "loss": 0.0239, "reward": 1.0260416865348816, "reward_std": 0.09488309193402529, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9885416746139526, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 671.083349609375, "epoch": 0.22627620419267083, "grad_norm": 0.10532180964946747, "kl": 0.14307744055986404, "learning_rate": 1.904608040145469e-05, "loss": 0.0231, "reward": 1.0723958551883697, "reward_std": 0.12180216908454895, "rewards/accuracy_reward": 0.09375000316649676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458373069763, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 612.1125183105469, "epoch": 0.22659625540086414, "grad_norm": 0.14669953286647797, "kl": 0.16547591611742973, "learning_rate": 1.9041311016592603e-05, "loss": 0.0294, "reward": 1.0380208551883698, "reward_std": 0.12260267194360494, "rewards/accuracy_reward": 0.058333334513008595, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9796875059604645, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 619.3270935058594, "epoch": 0.22691630660905746, "grad_norm": 0.12648318707942963, "kl": 0.16989169344305993, "learning_rate": 1.903653033872818e-05, "loss": 0.0318, "reward": 1.1723958611488343, "reward_std": 0.12271066904067993, "rewards/accuracy_reward": 0.18958333879709244, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9828125059604644, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 641.5833618164063, "epoch": 0.22723635781725077, "grad_norm": 0.3657815158367157, "kl": 0.27132780849933624, "learning_rate": 1.90317383738327e-05, "loss": 0.0943, "reward": 0.9718750178813934, "reward_std": 0.16001009345054626, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9510416805744171, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 589.6895935058594, "epoch": 0.22755640902544408, "grad_norm": 0.18482835590839386, "kl": 0.19547294229269027, "learning_rate": 1.902693512789154e-05, "loss": 0.0638, "reward": 1.1067708611488343, "reward_std": 0.07665946874767542, "rewards/accuracy_reward": 0.12291667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541746139526, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 593.2645965576172, "epoch": 0.2278764602336374, "grad_norm": 0.45472854375839233, "kl": 0.2865824416279793, "learning_rate": 1.902212060690418e-05, "loss": 0.0903, "reward": 1.0869791984558106, "reward_std": 0.16627902090549468, "rewards/accuracy_reward": 0.11875000428408385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291805744171, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 559.1000244140625, "epoch": 0.2281965114418307, "grad_norm": 0.3362894058227539, "kl": 0.2055267460644245, "learning_rate": 1.901729481688416e-05, "loss": 0.0843, "reward": 1.0875000298023223, "reward_std": 0.09311237446963787, "rewards/accuracy_reward": 0.11041667014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833432674408, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 652.6833557128906, "epoch": 0.228516562650024, "grad_norm": 0.3454423248767853, "kl": 0.3243007093667984, "learning_rate": 1.9012457763859117e-05, "loss": 0.1082, "reward": 1.0291666865348816, "reward_std": 0.13885583207011223, "rewards/accuracy_reward": 0.0645833345130086, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833432674408, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 568.847933959961, "epoch": 0.22883661385821732, "grad_norm": 0.3572365641593933, "kl": 0.4481654688715935, "learning_rate": 1.9007609453870738e-05, "loss": 0.1275, "reward": 1.0604166865348816, "reward_std": 0.16449397206306457, "rewards/accuracy_reward": 0.0958333371207118, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833432674408, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 546.216683959961, "epoch": 0.22915666506641064, "grad_norm": 0.6198186278343201, "kl": 0.7278454639017582, "learning_rate": 1.9002749892974785e-05, "loss": 0.1337, "reward": 1.1057291865348815, "reward_std": 0.14216041043400765, "rewards/accuracy_reward": 0.13125000502914191, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791686534882, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 611.1416839599609, "epoch": 0.22947671627460395, "grad_norm": 0.6964424252510071, "kl": 0.856408603489399, "learning_rate": 1.8997879087241065e-05, "loss": 0.1282, "reward": 1.0177083492279053, "reward_std": 0.11686915159225464, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.971875011920929, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 628.9312622070313, "epoch": 0.22979676748279726, "grad_norm": 0.7025728821754456, "kl": 0.38182810619473456, "learning_rate": 1.8992997042753437e-05, "loss": 0.0746, "reward": 1.1567708611488343, "reward_std": 0.10790105611085891, "rewards/accuracy_reward": 0.170833339355886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375059604645, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 610.341683959961, "epoch": 0.23011681869099057, "grad_norm": 1.644096851348877, "kl": 1.7317875981330872, "learning_rate": 1.8988103765609788e-05, "loss": 0.1786, "reward": 1.080729180574417, "reward_std": 0.1288726843893528, "rewards/accuracy_reward": 0.1020833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458373069763, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 678.0250183105469, "epoch": 0.23043686989918388, "grad_norm": 0.5850602984428406, "kl": 1.0677958868443966, "learning_rate": 1.898319926192204e-05, "loss": 0.09, "reward": 1.0921875298023225, "reward_std": 0.07778571378439665, "rewards/accuracy_reward": 0.11041666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708432674408, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 662.208349609375, "epoch": 0.2307569211073772, "grad_norm": 7.026660919189453, "kl": 3.291440422087908, "learning_rate": 1.897828353781614e-05, "loss": 0.2707, "reward": 0.9755208551883697, "reward_std": 0.10903428643941879, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708432674408, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 698.3687744140625, "epoch": 0.23107697231557048, "grad_norm": 0.7763417363166809, "kl": 0.5105241164565086, "learning_rate": 1.897335659943205e-05, "loss": 0.0862, "reward": 1.010416680574417, "reward_std": 0.09227172508835793, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 639.0229339599609, "epoch": 0.2313970235237638, "grad_norm": 3.8951375484466553, "kl": 2.1611140362918375, "learning_rate": 1.8968418452923735e-05, "loss": 0.2017, "reward": 1.1088541865348815, "reward_std": 0.1484844669699669, "rewards/accuracy_reward": 0.13750000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541686534881, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 673.3583557128907, "epoch": 0.2317170747319571, "grad_norm": 0.4952332675457001, "kl": 0.6473507910966874, "learning_rate": 1.8963469104459157e-05, "loss": 0.0957, "reward": 1.062500011920929, "reward_std": 0.1178272632881999, "rewards/accuracy_reward": 0.08541666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833373069763, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 638.5625244140625, "epoch": 0.2320371259401504, "grad_norm": 105.82144165039062, "kl": 18.172766876220702, "learning_rate": 1.8958508560220276e-05, "loss": 1.323, "reward": 1.0343750238418579, "reward_std": 0.1931321881711483, "rewards/accuracy_reward": 0.07916666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9552083432674408, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 641.7812683105469, "epoch": 0.23235717714834372, "grad_norm": 1.5600378513336182, "kl": 0.5689830243587494, "learning_rate": 1.8953536826403035e-05, "loss": 0.102, "reward": 1.1260416984558106, "reward_std": 0.17821024954319, "rewards/accuracy_reward": 0.1562500052154064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916746139527, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 634.5479309082032, "epoch": 0.23267722835653704, "grad_norm": 12.669792175292969, "kl": 4.572777527570724, "learning_rate": 1.8948553909217354e-05, "loss": 0.4281, "reward": 1.0020833492279053, "reward_std": 0.20425619408488274, "rewards/accuracy_reward": 0.04791666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9541666805744171, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 642.2770935058594, "epoch": 0.23299727956473035, "grad_norm": 1.4605772495269775, "kl": 1.2693428099155426, "learning_rate": 1.894355981488712e-05, "loss": 0.1494, "reward": 1.027604192495346, "reward_std": 0.14785205852240324, "rewards/accuracy_reward": 0.06458333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208432674408, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 627.6437744140625, "epoch": 0.23331733077292366, "grad_norm": 1.3393751382827759, "kl": 1.4025199614465236, "learning_rate": 1.8938554549650172e-05, "loss": 0.1719, "reward": 1.0317708432674408, "reward_std": 0.13696985617280005, "rewards/accuracy_reward": 0.07708333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9546875178813934, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 625.658349609375, "epoch": 0.23363738198111697, "grad_norm": 4.692083358764648, "kl": 4.138763834536076, "learning_rate": 1.893353811975832e-05, "loss": 0.3455, "reward": 1.0473958373069763, "reward_std": 0.1992236189544201, "rewards/accuracy_reward": 0.10833333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9390625059604645, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 608.952099609375, "epoch": 0.23395743318931028, "grad_norm": 1.7119262218475342, "kl": 1.134942190349102, "learning_rate": 1.8928510531477305e-05, "loss": 0.181, "reward": 0.9755208551883697, "reward_std": 0.1736592784523964, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9380208432674408, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 600.6604309082031, "epoch": 0.2342774843975036, "grad_norm": 2.9206032752990723, "kl": 2.6096622347831726, "learning_rate": 1.892347179108681e-05, "loss": 0.2967, "reward": 0.9921875178813935, "reward_std": 0.1957916386425495, "rewards/accuracy_reward": 0.05208333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9401041865348816, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 582.6271118164062, "epoch": 0.2345975356056969, "grad_norm": 10.899141311645508, "kl": 5.642543570697308, "learning_rate": 1.891842190488045e-05, "loss": 0.5927, "reward": 1.045312523841858, "reward_std": 0.19487025067210198, "rewards/accuracy_reward": 0.1062500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9390625178813934, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 591.308349609375, "epoch": 0.23491758681389022, "grad_norm": 9.065811157226562, "kl": 4.151773124933243, "learning_rate": 1.891336087916576e-05, "loss": 0.4228, "reward": 0.9750000238418579, "reward_std": 0.20050331354141235, "rewards/accuracy_reward": 0.050000001303851606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.925000011920929, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 578.2458557128906, "epoch": 0.23523763802208353, "grad_norm": 2.479764223098755, "kl": 0.7591698169708252, "learning_rate": 1.8908288720264184e-05, "loss": 0.2353, "reward": 1.027604204416275, "reward_std": 0.23811267241835593, "rewards/accuracy_reward": 0.12083333842456341, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9067708492279053, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 543.2541809082031, "epoch": 0.23555768923027684, "grad_norm": 3.136521577835083, "kl": 0.9379741698503494, "learning_rate": 1.8903205434511072e-05, "loss": 0.2684, "reward": 1.041666680574417, "reward_std": 0.1941637597978115, "rewards/accuracy_reward": 0.11458333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9270833432674408, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 536.9104309082031, "epoch": 0.23587774043847015, "grad_norm": 15.852173805236816, "kl": 8.845919364690781, "learning_rate": 1.8898111028255686e-05, "loss": 0.8396, "reward": 1.0156250238418578, "reward_std": 0.1994689255952835, "rewards/accuracy_reward": 0.08333333544433116, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9322916865348816, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 553.114599609375, "epoch": 0.23619779164666346, "grad_norm": 10.610895156860352, "kl": 7.209189605712891, "learning_rate": 1.889300550786116e-05, "loss": 0.7747, "reward": 1.0208333492279054, "reward_std": 0.21294072940945624, "rewards/accuracy_reward": 0.10833333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9125000178813935, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 553.3521087646484, "epoch": 0.23651784285485677, "grad_norm": 1.1002973318099976, "kl": 2.1927175372838974, "learning_rate": 1.888788887970452e-05, "loss": 0.3327, "reward": 0.9885416805744172, "reward_std": 0.19914889633655547, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9156250119209289, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 542.7937713623047, "epoch": 0.2368378940630501, "grad_norm": 0.6471092104911804, "kl": 2.1248645067214964, "learning_rate": 1.888276115017666e-05, "loss": 0.3223, "reward": 0.9750000238418579, "reward_std": 0.16674922611564397, "rewards/accuracy_reward": 0.03333333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9416666865348816, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 557.6062744140625, "epoch": 0.2371579452712434, "grad_norm": 2.839332342147827, "kl": 3.6467182874679565, "learning_rate": 1.887762232568235e-05, "loss": 0.4353, "reward": 1.0088541865348817, "reward_std": 0.23403916209936143, "rewards/accuracy_reward": 0.07291666809469462, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.935937511920929, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 535.931265258789, "epoch": 0.2374779964794367, "grad_norm": 2.3406782150268555, "kl": 1.8981781423091888, "learning_rate": 1.8872472412640207e-05, "loss": 0.339, "reward": 1.0604166984558105, "reward_std": 0.19939529821276664, "rewards/accuracy_reward": 0.12708333637565375, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9312500178813934, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 543.2083587646484, "epoch": 0.23779804768763002, "grad_norm": 2.462381601333618, "kl": 2.055042415857315, "learning_rate": 1.8867311417482707e-05, "loss": 0.3379, "reward": 0.9421875178813934, "reward_std": 0.21793267950415612, "rewards/accuracy_reward": 0.022916667722165586, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9192708551883697, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 497.708349609375, "epoch": 0.23811809889582333, "grad_norm": 9.371177673339844, "kl": 7.4982593089342116, "learning_rate": 1.886213934665616e-05, "loss": 0.6855, "reward": 0.9718750298023224, "reward_std": 0.2046487707644701, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9260416865348816, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 551.5521057128906, "epoch": 0.23843815010401664, "grad_norm": 4.96921443939209, "kl": 6.159427142143249, "learning_rate": 1.8856956206620717e-05, "loss": 0.6304, "reward": 0.9744791984558105, "reward_std": 0.2073620229959488, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.907812523841858, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 550.2291809082031, "epoch": 0.23875820131220996, "grad_norm": 2.8724591732025146, "kl": 1.0669405221939088, "learning_rate": 1.8851762003850348e-05, "loss": 0.2729, "reward": 1.002604204416275, "reward_std": 0.2241446740925312, "rewards/accuracy_reward": 0.07291666902601719, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9296875238418579, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 570.4291778564453, "epoch": 0.23907825252040327, "grad_norm": 2.7185521125793457, "kl": 1.5261371374130248, "learning_rate": 1.8846556744832852e-05, "loss": 0.3374, "reward": 0.967187511920929, "reward_std": 0.272777758538723, "rewards/accuracy_reward": 0.08333333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8838541805744171, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 534.4187713623047, "epoch": 0.23939830372859658, "grad_norm": 1.0410653352737427, "kl": 1.700943198800087, "learning_rate": 1.8841340436069825e-05, "loss": 0.3202, "reward": 0.9619791865348816, "reward_std": 0.23116603791713713, "rewards/accuracy_reward": 0.03541666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.926562511920929, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 471.2875183105469, "epoch": 0.2397183549367899, "grad_norm": 4.877148628234863, "kl": 3.820130455493927, "learning_rate": 1.8836113084076673e-05, "loss": 0.5104, "reward": 1.028125023841858, "reward_std": 0.2020101472735405, "rewards/accuracy_reward": 0.0937500026077032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9343750059604645, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 557.7541778564453, "epoch": 0.2400384061449832, "grad_norm": 11.63560676574707, "kl": 8.239775601029397, "learning_rate": 1.883087469538259e-05, "loss": 0.7489, "reward": 0.9447916865348815, "reward_std": 0.1876985676586628, "rewards/accuracy_reward": 0.01041666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.934375011920929, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 528.8208435058593, "epoch": 0.2403584573531765, "grad_norm": 8.246529579162598, "kl": 6.4461568117141725, "learning_rate": 1.8825625276530558e-05, "loss": 0.701, "reward": 1.0072916865348815, "reward_std": 0.23445617109537126, "rewards/accuracy_reward": 0.10625000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9010416805744171, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 498.65834655761716, "epoch": 0.24067850856136982, "grad_norm": 1.731472134590149, "kl": 1.476518702507019, "learning_rate": 1.882036483407734e-05, "loss": 0.3387, "reward": 1.0395833551883698, "reward_std": 0.2580983817577362, "rewards/accuracy_reward": 0.10000000353902579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9395833373069763, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 509.2354370117188, "epoch": 0.24099855976956314, "grad_norm": 1.2525382041931152, "kl": 1.777240651845932, "learning_rate": 1.8815093374593463e-05, "loss": 0.3417, "reward": 0.9989583551883697, "reward_std": 0.22307676412165164, "rewards/accuracy_reward": 0.07500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9239583492279053, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 526.858349609375, "epoch": 0.24131861097775645, "grad_norm": 1.0679889917373657, "kl": 2.1552729278802873, "learning_rate": 1.880981090466321e-05, "loss": 0.4127, "reward": 0.9875000238418579, "reward_std": 0.20257178843021392, "rewards/accuracy_reward": 0.06250000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.925000011920929, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 521.0416870117188, "epoch": 0.24163866218594976, "grad_norm": 1.1074210405349731, "kl": 1.7916708946228028, "learning_rate": 1.8804517430884633e-05, "loss": 0.3344, "reward": 0.9458333551883698, "reward_std": 0.2129554446786642, "rewards/accuracy_reward": 0.025000000558793544, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9208333492279053, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 547.1396026611328, "epoch": 0.24195871339414307, "grad_norm": 0.8018332719802856, "kl": 2.069081211090088, "learning_rate": 1.879921295986951e-05, "loss": 0.359, "reward": 1.0026041865348816, "reward_std": 0.21627548113465309, "rewards/accuracy_reward": 0.08125000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9213541805744171, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 541.2937683105469, "epoch": 0.24227876460233638, "grad_norm": 0.5721918344497681, "kl": 2.462398773431778, "learning_rate": 1.879389749824336e-05, "loss": 0.3709, "reward": 0.9510416805744171, "reward_std": 0.23518796935677527, "rewards/accuracy_reward": 0.05000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9010416805744171, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 628.2625183105469, "epoch": 0.2425988158105297, "grad_norm": 1.7147362232208252, "kl": 2.181269180774689, "learning_rate": 1.8788571052645448e-05, "loss": 0.3649, "reward": 0.9447916924953461, "reward_std": 0.26175126880407334, "rewards/accuracy_reward": 0.06250000149011611, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8822916805744171, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 602.789599609375, "epoch": 0.242918867018723, "grad_norm": 1.032857060432434, "kl": 3.850922179222107, "learning_rate": 1.8783233629728725e-05, "loss": 0.4549, "reward": 0.8744791865348815, "reward_std": 0.2859712585806847, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8536458432674408, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 640.3812622070312, "epoch": 0.24323891822691632, "grad_norm": 1.8498499393463135, "kl": 2.852463459968567, "learning_rate": 1.877788523615988e-05, "loss": 0.3521, "reward": 0.8567708551883697, "reward_std": 0.29042457044124603, "rewards/accuracy_reward": 0.020833334326744078, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8359375178813935, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 670.9916809082031, "epoch": 0.24355896943510963, "grad_norm": 3.006237030029297, "kl": 6.155748796463013, "learning_rate": 1.87725258786193e-05, "loss": 0.6083, "reward": 0.8255208492279053, "reward_std": 0.2984800562262535, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7880208492279053, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 622.9521118164063, "epoch": 0.24387902064330294, "grad_norm": 1.1661911010742188, "kl": 3.726873683929443, "learning_rate": 1.8767155563801053e-05, "loss": 0.4531, "reward": 0.8677083551883698, "reward_std": 0.28935869932174685, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7947916865348816, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 623.7646057128907, "epoch": 0.24419907185149625, "grad_norm": 2.272340774536133, "kl": 4.760553753376007, "learning_rate": 1.8761774298412905e-05, "loss": 0.5509, "reward": 0.8604166984558106, "reward_std": 0.2656098708510399, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8250000238418579, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 615.7062683105469, "epoch": 0.24451912305968956, "grad_norm": 1.5633440017700195, "kl": 1.859854531288147, "learning_rate": 1.8756382089176303e-05, "loss": 0.3324, "reward": 0.9937500059604645, "reward_std": 0.2956600204110146, "rewards/accuracy_reward": 0.1062500050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.887500011920929, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 625.6916900634766, "epoch": 0.24483917426788285, "grad_norm": 0.5739659667015076, "kl": 2.4789600491523744, "learning_rate": 1.8750978942826353e-05, "loss": 0.3688, "reward": 0.9510416924953461, "reward_std": 0.26511411666870116, "rewards/accuracy_reward": 0.08333333432674409, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8677083492279053, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 605.0354431152343, "epoch": 0.24515922547607616, "grad_norm": 0.3142963647842407, "kl": 1.9652863681316375, "learning_rate": 1.874556486611183e-05, "loss": 0.3312, "reward": 0.9567708730697632, "reward_std": 0.3204138189554214, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8796875238418579, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 627.9125122070312, "epoch": 0.24547927668426947, "grad_norm": 0.2902325987815857, "kl": 2.781359338760376, "learning_rate": 1.8740139865795154e-05, "loss": 0.4133, "reward": 0.8859375238418579, "reward_std": 0.28512853384017944, "rewards/accuracy_reward": 0.04791666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8380208551883698, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 600.4104431152343, "epoch": 0.24579932789246278, "grad_norm": 1.0774624347686768, "kl": 2.3280814051628114, "learning_rate": 1.8734703948652398e-05, "loss": 0.4042, "reward": 0.8906250178813935, "reward_std": 0.28475052416324614, "rewards/accuracy_reward": 0.043750002048909664, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.846875011920929, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 637.6333618164062, "epoch": 0.2461193791006561, "grad_norm": 2.035698413848877, "kl": 4.03775839805603, "learning_rate": 1.8729257121473262e-05, "loss": 0.474, "reward": 0.8588541805744171, "reward_std": 0.31844222843647, "rewards/accuracy_reward": 0.025000001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8338541805744171, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 611.2562683105468, "epoch": 0.2464394303088494, "grad_norm": 0.49926403164863586, "kl": 3.1284287214279174, "learning_rate": 1.872379939106108e-05, "loss": 0.397, "reward": 0.8848958492279053, "reward_std": 0.3051090121269226, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8453125059604645, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 649.7354370117188, "epoch": 0.24675948151704272, "grad_norm": 0.631862998008728, "kl": 3.050023341178894, "learning_rate": 1.8718330764232802e-05, "loss": 0.4031, "reward": 0.9828125357627868, "reward_std": 0.29355679303407667, "rewards/accuracy_reward": 0.15208333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8307291865348816, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 763.3729309082031, "epoch": 0.24707953272523603, "grad_norm": 5.6362738609313965, "kl": 7.783353233337403, "learning_rate": 1.8712851247818985e-05, "loss": 0.6343, "reward": 0.7375000238418579, "reward_std": 0.3411813169717789, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6708333551883697, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 781.3562805175782, "epoch": 0.24739958393342934, "grad_norm": 1.8554197549819946, "kl": 5.768043446540832, "learning_rate": 1.870736084866379e-05, "loss": 0.4355, "reward": 0.6088541805744171, "reward_std": 0.342638885974884, "rewards/accuracy_reward": 0.00625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6026041805744171, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 808.0021057128906, "epoch": 0.24771963514162265, "grad_norm": 3.9803450107574463, "kl": 1.709758222103119, "learning_rate": 1.8701859573624975e-05, "loss": 0.1949, "reward": 0.8083333551883698, "reward_std": 0.2954397678375244, "rewards/accuracy_reward": 0.11250000409781932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6958333551883698, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 839.1916870117187, "epoch": 0.24803968634981596, "grad_norm": 3.2302024364471436, "kl": 1.209952062368393, "learning_rate": 1.869634742957388e-05, "loss": 0.1301, "reward": 0.7463541865348816, "reward_std": 0.30243373960256575, "rewards/accuracy_reward": 0.02083333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7255208551883697, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 786.752099609375, "epoch": 0.24835973755800927, "grad_norm": 2.0077474117279053, "kl": 1.2873852461576463, "learning_rate": 1.8690824423395412e-05, "loss": 0.1697, "reward": 0.9005208551883698, "reward_std": 0.31336794048547745, "rewards/accuracy_reward": 0.1333333408460021, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.767187523841858, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 678.7021057128907, "epoch": 0.2486797887662026, "grad_norm": 1.3721160888671875, "kl": 1.352865958213806, "learning_rate": 1.868529056198806e-05, "loss": 0.2235, "reward": 0.9020833551883698, "reward_std": 0.26901768147945404, "rewards/accuracy_reward": 0.04166666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8604166865348816, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 718.2958557128907, "epoch": 0.2489998399743959, "grad_norm": 1.037018060684204, "kl": 1.5384167373180389, "learning_rate": 1.867974585226386e-05, "loss": 0.1842, "reward": 0.9026041865348816, "reward_std": 0.22797319442033767, "rewards/accuracy_reward": 0.016666667722165584, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8859375178813934, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 786.62294921875, "epoch": 0.2493198911825892, "grad_norm": 1.917616605758667, "kl": 2.3354121506214143, "learning_rate": 1.8674190301148406e-05, "loss": 0.1736, "reward": 0.9625000357627869, "reward_std": 0.23147097155451773, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8979166924953461, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 856.283349609375, "epoch": 0.24963994239078252, "grad_norm": 1.3474206924438477, "kl": 2.2255136251449583, "learning_rate": 1.866862391558083e-05, "loss": 0.0977, "reward": 0.8802083551883697, "reward_std": 0.20946567207574845, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8093750238418579, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 815.7333557128907, "epoch": 0.24995999359897583, "grad_norm": 0.525276780128479, "kl": 1.391992512345314, "learning_rate": 1.8663046702513795e-05, "loss": 0.0309, "reward": 0.7567708432674408, "reward_std": 0.13151366412639617, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7213541805744171, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 929.2333557128907, "epoch": 0.25028004480716914, "grad_norm": 0.5646325945854187, "kl": 1.0617589622735977, "learning_rate": 1.8657458668913493e-05, "loss": 0.0313, "reward": 0.7848958671092987, "reward_std": 0.0879902821034193, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7182291924953461, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 826.2625122070312, "epoch": 0.2506000960153625, "grad_norm": 1.0494465827941895, "kl": 0.8743727058172226, "learning_rate": 1.8651859821759623e-05, "loss": 0.0169, "reward": 0.7416666924953461, "reward_std": 0.1264648325741291, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7062500238418579, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 754.9791870117188, "epoch": 0.25092014722355577, "grad_norm": 0.7133949398994446, "kl": 1.71711206138134, "learning_rate": 1.8646250168045402e-05, "loss": 0.0536, "reward": 0.8036458551883697, "reward_std": 0.1877228483557701, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7036458551883698, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 789.0812683105469, "epoch": 0.2512401984317491, "grad_norm": 34.78798294067383, "kl": 4.583227729797363, "learning_rate": 1.8640629714777536e-05, "loss": 0.1088, "reward": 0.8328125178813934, "reward_std": 0.23548691123723983, "rewards/accuracy_reward": 0.01666666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8161458492279052, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 669.0083557128906, "epoch": 0.2515602496399424, "grad_norm": 1.3595727682113647, "kl": 2.5710567235946655, "learning_rate": 1.8634998468976225e-05, "loss": 0.2484, "reward": 0.7208333611488342, "reward_std": 0.17345971316099168, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6875000298023224, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 772.6666870117188, "epoch": 0.2518803008481357, "grad_norm": 1.8687893152236938, "kl": 3.129037153720856, "learning_rate": 1.862935643767514e-05, "loss": 0.1782, "reward": 0.6364583551883698, "reward_std": 0.20351852625608444, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6010416865348815, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 683.4479370117188, "epoch": 0.252200352056329, "grad_norm": 0.4782872200012207, "kl": 2.446159327030182, "learning_rate": 1.862370362792144e-05, "loss": 0.217, "reward": 0.7505208671092987, "reward_std": 0.19110870510339736, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6713541924953461, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 703.6479370117188, "epoch": 0.2525204032645223, "grad_norm": 0.7935335636138916, "kl": 2.2065913677215576, "learning_rate": 1.8618040046775727e-05, "loss": 0.1883, "reward": 0.7151041805744172, "reward_std": 0.19547585546970367, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375178813935, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 702.8125183105469, "epoch": 0.25284045447271564, "grad_norm": 1.0700191259384155, "kl": 1.690595942735672, "learning_rate": 1.8612365701312075e-05, "loss": 0.1709, "reward": 0.6677083492279052, "reward_std": 0.1768379256129265, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6656250178813934, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 721.0458435058594, "epoch": 0.2531605056809089, "grad_norm": 0.8433765769004822, "kl": 1.9942040205001832, "learning_rate": 1.8606680598617995e-05, "loss": 0.1587, "reward": 0.6973958551883698, "reward_std": 0.1701609805226326, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6640625178813935, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 667.1479370117188, "epoch": 0.25348055688910226, "grad_norm": 4.201164722442627, "kl": 2.1893013775348664, "learning_rate": 1.8600984745794438e-05, "loss": 0.1323, "reward": 0.6645833551883698, "reward_std": 0.1957714796066284, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6312500238418579, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 891.3812683105468, "epoch": 0.25380060809729554, "grad_norm": 0.30567294359207153, "kl": 2.2855528831481933, "learning_rate": 1.859527814995577e-05, "loss": 0.0917, "reward": 0.7296875178813934, "reward_std": 0.17712628692388535, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6630208492279053, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 886.8958618164063, "epoch": 0.2541206593054889, "grad_norm": 0.6789547204971313, "kl": 1.546866774559021, "learning_rate": 1.858956081822979e-05, "loss": 0.0331, "reward": 0.7739583611488342, "reward_std": 0.16994911432266235, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6739583551883698, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 845.0812744140625, "epoch": 0.25444071051368217, "grad_norm": 0.2693403363227844, "kl": 1.9320049643516541, "learning_rate": 1.8583832757757708e-05, "loss": 0.1012, "reward": 0.7270833611488342, "reward_std": 0.17218801006674767, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6604166865348816, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 863.764599609375, "epoch": 0.2547607617218755, "grad_norm": 0.27951666712760925, "kl": 2.302595019340515, "learning_rate": 1.8578093975694116e-05, "loss": 0.0908, "reward": 0.7333333551883697, "reward_std": 0.17331424206495286, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6645833492279053, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 923.2771057128906, "epoch": 0.2550808129300688, "grad_norm": 0.3106688857078552, "kl": 2.457894867658615, "learning_rate": 1.8572344479207015e-05, "loss": 0.0761, "reward": 0.6953125238418579, "reward_std": 0.17134494259953498, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6619791865348816, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 939.3854431152344, "epoch": 0.25540086413826213, "grad_norm": 0.24902932345867157, "kl": 1.85448357462883, "learning_rate": 1.8566584275477783e-05, "loss": 0.0579, "reward": 0.6687500238418579, "reward_std": 0.15060244724154473, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6687500238418579, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 958.8750122070312, "epoch": 0.2557209153464554, "grad_norm": 0.6678584218025208, "kl": 1.8299700140953064, "learning_rate": 1.8560813371701174e-05, "loss": 0.0511, "reward": 0.6729166865348816, "reward_std": 0.15844282358884812, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6729166865348816, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 935.5291809082031, "epoch": 0.25604096655464875, "grad_norm": 0.3589387834072113, "kl": 2.199638992547989, "learning_rate": 1.8555031775085307e-05, "loss": 0.0748, "reward": 0.7260416924953461, "reward_std": 0.17357457876205445, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6906250238418579, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 925.0354370117187, "epoch": 0.25636101776284204, "grad_norm": 0.577064573764801, "kl": 2.5014767736196517, "learning_rate": 1.854923949285165e-05, "loss": 0.1002, "reward": 0.7541666924953461, "reward_std": 0.1812018111348152, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7145833551883698, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 835.470849609375, "epoch": 0.2566810689710354, "grad_norm": 0.31030163168907166, "kl": 2.0380566120147705, "learning_rate": 1.8543436532235024e-05, "loss": 0.0894, "reward": 0.8473958611488343, "reward_std": 0.21838683038949966, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7807291865348815, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 757.6479248046875, "epoch": 0.25700112017922866, "grad_norm": 0.29996275901794434, "kl": 1.3410570591688156, "learning_rate": 1.853762290048359e-05, "loss": 0.1172, "reward": 0.9130208492279053, "reward_std": 0.21495410203933715, "rewards/accuracy_reward": 0.04166666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8713541805744172, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 649.0166931152344, "epoch": 0.257321171387422, "grad_norm": 0.4481050968170166, "kl": 0.744247005879879, "learning_rate": 1.853179860485883e-05, "loss": 0.0867, "reward": 1.0031250298023224, "reward_std": 0.19283585250377655, "rewards/accuracy_reward": 0.11666667014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8864583492279052, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 598.3208557128906, "epoch": 0.2576412225956153, "grad_norm": 1.6031851768493652, "kl": 1.8358533322811126, "learning_rate": 1.8525963652635556e-05, "loss": 0.1293, "reward": 0.9744791924953461, "reward_std": 0.2302825279533863, "rewards/accuracy_reward": 0.1000000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8744791865348815, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 634.2333557128907, "epoch": 0.2579612738038086, "grad_norm": 0.8979751467704773, "kl": 1.0591185629367827, "learning_rate": 1.852011805110188e-05, "loss": 0.1109, "reward": 1.0229166865348815, "reward_std": 0.177987564727664, "rewards/accuracy_reward": 0.09166666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9312500119209289, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 661.3687683105469, "epoch": 0.2582813250120019, "grad_norm": 0.4129449427127838, "kl": 1.2034089416265488, "learning_rate": 1.851426180755922e-05, "loss": 0.1144, "reward": 0.9953125178813934, "reward_std": 0.15365473832935095, "rewards/accuracy_reward": 0.0458333345130086, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9494791865348816, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 641.5812744140625, "epoch": 0.25860137622019524, "grad_norm": 0.3640819489955902, "kl": 1.47001773416996, "learning_rate": 1.8508394929322287e-05, "loss": 0.1258, "reward": 0.9661458551883697, "reward_std": 0.12315437085926532, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958492279053, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 605.7125183105469, "epoch": 0.25892142742838853, "grad_norm": 0.9221828579902649, "kl": 2.0171607047319413, "learning_rate": 1.8502517423719075e-05, "loss": 0.113, "reward": 1.0677083671092986, "reward_std": 0.16264262348413466, "rewards/accuracy_reward": 0.12500000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9427083492279053, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 596.5166809082032, "epoch": 0.25924147863658187, "grad_norm": 0.5077120661735535, "kl": 1.0036138698458672, "learning_rate": 1.8496629298090855e-05, "loss": 0.0882, "reward": 0.963541692495346, "reward_std": 0.1293813869357109, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9593750178813935, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 584.4104309082031, "epoch": 0.25956152984477515, "grad_norm": 0.6715332269668579, "kl": 1.3229067370295524, "learning_rate": 1.8490730559792153e-05, "loss": 0.121, "reward": 1.0197916865348815, "reward_std": 0.11515746731311083, "rewards/accuracy_reward": 0.05625000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9635416805744171, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 584.3958465576172, "epoch": 0.2598815810529685, "grad_norm": 0.5190712809562683, "kl": 0.5667064756155014, "learning_rate": 1.848482121619076e-05, "loss": 0.0447, "reward": 1.0526041865348816, "reward_std": 0.13494310155510902, "rewards/accuracy_reward": 0.07916667126119137, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375059604645, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 537.341683959961, "epoch": 0.2602016322611618, "grad_norm": 0.6430520415306091, "kl": 0.50772774964571, "learning_rate": 1.8478901274667716e-05, "loss": 0.0742, "reward": 1.0723958611488342, "reward_std": 0.10893401503562927, "rewards/accuracy_reward": 0.09583333935588598, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625178813935, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 571.8104309082031, "epoch": 0.2605216834693551, "grad_norm": 1.1371740102767944, "kl": 2.0117278814315798, "learning_rate": 1.8472970742617284e-05, "loss": 0.229, "reward": 0.9604166805744171, "reward_std": 0.18165156841278077, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9291666805744171, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 544.8437561035156, "epoch": 0.2608417346775484, "grad_norm": 0.37978023290634155, "kl": 1.3154131323099136, "learning_rate": 1.846702962744697e-05, "loss": 0.1291, "reward": 1.0223958432674407, "reward_std": 0.1484291136264801, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.951562511920929, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 585.9187683105469, "epoch": 0.26116178588574174, "grad_norm": 0.34045541286468506, "kl": 1.4208843201398849, "learning_rate": 1.8461077936577495e-05, "loss": 0.1463, "reward": 0.9494791805744172, "reward_std": 0.18640333712100982, "rewards/accuracy_reward": 0.0125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9369791805744171, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 508.3791809082031, "epoch": 0.261481837093935, "grad_norm": 0.4059346616268158, "kl": 1.6916437029838562, "learning_rate": 1.8455115677442782e-05, "loss": 0.1865, "reward": 1.0869791865348817, "reward_std": 0.2534780815243721, "rewards/accuracy_reward": 0.1562500052154064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9307291805744171, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 580.2896057128906, "epoch": 0.26180188830212836, "grad_norm": 0.6775670647621155, "kl": 2.5077103793621065, "learning_rate": 1.844914285748996e-05, "loss": 0.2779, "reward": 0.9302083492279053, "reward_std": 0.24873557239770888, "rewards/accuracy_reward": 0.03333333339542151, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8968750059604644, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 536.7562622070312, "epoch": 0.26212193951032164, "grad_norm": 0.7965701222419739, "kl": 2.863987410068512, "learning_rate": 1.8443159484179348e-05, "loss": 0.3095, "reward": 0.9828125298023224, "reward_std": 0.26197887808084486, "rewards/accuracy_reward": 0.11250000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8703125178813934, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 525.910433959961, "epoch": 0.262441990718515, "grad_norm": 0.577987015247345, "kl": 1.5371546924114228, "learning_rate": 1.8437165564984455e-05, "loss": 0.1004, "reward": 1.0109375178813935, "reward_std": 0.2512478806078434, "rewards/accuracy_reward": 0.10625000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.904687511920929, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 556.1666839599609, "epoch": 0.26276204192670827, "grad_norm": 0.9527766704559326, "kl": 1.177549660205841, "learning_rate": 1.8431161107391947e-05, "loss": 0.124, "reward": 0.9760416865348815, "reward_std": 0.2343311682343483, "rewards/accuracy_reward": 0.06041666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9156250178813934, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 508.45001831054685, "epoch": 0.2630820931349016, "grad_norm": 0.35349592566490173, "kl": 2.095759892463684, "learning_rate": 1.8425146118901664e-05, "loss": 0.1814, "reward": 0.9859375298023224, "reward_std": 0.20536768585443496, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9130208551883697, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 531.160433959961, "epoch": 0.2634021443430949, "grad_norm": 1.703538179397583, "kl": 3.468520486354828, "learning_rate": 1.841912060702659e-05, "loss": 0.3395, "reward": 0.9875000298023224, "reward_std": 0.26018320918083193, "rewards/accuracy_reward": 0.09583333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8916666805744171, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 500.3896026611328, "epoch": 0.26372219555128823, "grad_norm": 0.6156812310218811, "kl": 2.648340845108032, "learning_rate": 1.8413084579292868e-05, "loss": 0.2845, "reward": 0.9500000238418579, "reward_std": 0.2134397841989994, "rewards/accuracy_reward": 0.04375000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9062500238418579, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 495.5458465576172, "epoch": 0.2640422467594815, "grad_norm": 0.5195518136024475, "kl": 0.9634673684835434, "learning_rate": 1.840703804323976e-05, "loss": 0.1202, "reward": 1.004166692495346, "reward_std": 0.20163882821798323, "rewards/accuracy_reward": 0.052083334513008596, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9520833492279053, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 483.0604278564453, "epoch": 0.26436229796767485, "grad_norm": 0.7861474752426147, "kl": 1.254196584224701, "learning_rate": 1.8400981006419663e-05, "loss": 0.1639, "reward": 1.0604166865348816, "reward_std": 0.20427689626812934, "rewards/accuracy_reward": 0.12291667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9375000119209289, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 497.9104309082031, "epoch": 0.26468234917586814, "grad_norm": 0.278336763381958, "kl": 1.4873881816864014, "learning_rate": 1.8394913476398087e-05, "loss": 0.2505, "reward": 1.0411458611488342, "reward_std": 0.1484985716640949, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9390625119209289, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 483.8458526611328, "epoch": 0.2650024003840615, "grad_norm": 0.47557759284973145, "kl": 1.4743517637252808, "learning_rate": 1.838883546075365e-05, "loss": 0.1885, "reward": 1.0234375238418578, "reward_std": 0.1315991472452879, "rewards/accuracy_reward": 0.07291666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9505208492279053, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 499.23126220703125, "epoch": 0.26532245159225476, "grad_norm": 0.468987375497818, "kl": 1.7570225208997727, "learning_rate": 1.8382746967078063e-05, "loss": 0.2526, "reward": 0.9932291746139527, "reward_std": 0.17668437063694, "rewards/accuracy_reward": 0.05416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9390625119209289, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 491.00001831054686, "epoch": 0.2656425028004481, "grad_norm": 0.35486987233161926, "kl": 2.0428441941738127, "learning_rate": 1.837664800297613e-05, "loss": 0.1859, "reward": 0.9739583492279053, "reward_std": 0.1656632751226425, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9364583492279053, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 512.306265258789, "epoch": 0.2659625540086414, "grad_norm": 0.7836453914642334, "kl": 1.9435631185770035, "learning_rate": 1.8370538576065725e-05, "loss": 0.1945, "reward": 1.0937500238418578, "reward_std": 0.20406704619526864, "rewards/accuracy_reward": 0.14375000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9500000178813934, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 504.570849609375, "epoch": 0.26628260521683467, "grad_norm": 0.7468709945678711, "kl": 1.106460866332054, "learning_rate": 1.8364418693977803e-05, "loss": 0.1291, "reward": 1.0890625238418579, "reward_std": 0.16735546700656415, "rewards/accuracy_reward": 0.12916667088866235, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.957812511920929, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 555.9125183105468, "epoch": 0.266602656425028, "grad_norm": 0.4349055886268616, "kl": 1.8148438930511475, "learning_rate": 1.8358288364356366e-05, "loss": 0.1068, "reward": 1.005729180574417, "reward_std": 0.14619814604520798, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9390625178813934, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 534.9812683105469, "epoch": 0.2669227076332213, "grad_norm": 0.6686826348304749, "kl": 1.2067111015319825, "learning_rate": 1.8352147594858474e-05, "loss": 0.1426, "reward": 1.0645833551883697, "reward_std": 0.13354990780353546, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9604166805744171, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 544.9062683105469, "epoch": 0.26724275884141463, "grad_norm": 1.4165607690811157, "kl": 2.314134883880615, "learning_rate": 1.834599639315422e-05, "loss": 0.2464, "reward": 0.9781250119209289, "reward_std": 0.17349504306912422, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9364583492279053, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 563.8916900634765, "epoch": 0.2675628100496079, "grad_norm": 0.5959430932998657, "kl": 1.5935033410787582, "learning_rate": 1.833983476692673e-05, "loss": 0.1566, "reward": 1.0067708611488342, "reward_std": 0.16007075309753419, "rewards/accuracy_reward": 0.0583333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9484375178813934, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 639.4729400634766, "epoch": 0.26788286125780125, "grad_norm": 0.3902430534362793, "kl": 1.4951316177845002, "learning_rate": 1.8333662723872154e-05, "loss": 0.1353, "reward": 0.9494791865348816, "reward_std": 0.20283174850046634, "rewards/accuracy_reward": 0.016666667722165584, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9286458492279053, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 639.3229309082031, "epoch": 0.26820291246599454, "grad_norm": 0.6958396434783936, "kl": 0.9415562689304352, "learning_rate": 1.8327480271699647e-05, "loss": 0.1359, "reward": 1.0807291924953462, "reward_std": 0.13761940076947213, "rewards/accuracy_reward": 0.13333333730697633, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9473958432674408, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 591.2479400634766, "epoch": 0.2685229636741879, "grad_norm": 0.5779876708984375, "kl": 1.5146077901124955, "learning_rate": 1.8321287418131368e-05, "loss": 0.1672, "reward": 1.0000000178813935, "reward_std": 0.1803262263536453, "rewards/accuracy_reward": 0.0791666692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9208333551883697, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 612.8604278564453, "epoch": 0.26884301488238116, "grad_norm": 0.34577852487564087, "kl": 1.463090929389, "learning_rate": 1.8315084170902473e-05, "loss": 0.1737, "reward": 0.9578125298023223, "reward_std": 0.21278849691152574, "rewards/accuracy_reward": 0.02916666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9286458611488342, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 630.8458557128906, "epoch": 0.2691630660905745, "grad_norm": 0.8705610632896423, "kl": 1.8129864871501922, "learning_rate": 1.8308870537761094e-05, "loss": 0.264, "reward": 0.9651041924953461, "reward_std": 0.26597666591405866, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.8901041865348815, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 572.7208557128906, "epoch": 0.2694831172987678, "grad_norm": 0.6193972826004028, "kl": 1.368115884065628, "learning_rate": 1.8302646526468337e-05, "loss": 0.2307, "reward": 0.9614583551883698, "reward_std": 0.22459929436445236, "rewards/accuracy_reward": 0.03333333488553762, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9281250178813935, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 619.2021087646484, "epoch": 0.2698031685069611, "grad_norm": 0.2857276499271393, "kl": 1.0940616935491563, "learning_rate": 1.8296412144798266e-05, "loss": 0.1333, "reward": 0.9843750059604645, "reward_std": 0.16745015531778334, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9427083373069763, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 578.3583435058594, "epoch": 0.2701232197151544, "grad_norm": 0.357234925031662, "kl": 1.360547348856926, "learning_rate": 1.829016740053791e-05, "loss": 0.1622, "reward": 0.9958333551883698, "reward_std": 0.1787314772605896, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9291666865348815, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 622.8791809082031, "epoch": 0.27044327092334774, "grad_norm": 0.5601744651794434, "kl": 1.5113209307193756, "learning_rate": 1.8283912301487228e-05, "loss": 0.2104, "reward": 0.9359375238418579, "reward_std": 0.21898051649332045, "rewards/accuracy_reward": 0.02500000074505806, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.908854192495346, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 617.1708526611328, "epoch": 0.27076332213154103, "grad_norm": 0.3600686192512512, "kl": 0.8812868297100067, "learning_rate": 1.8277646855459124e-05, "loss": 0.1488, "reward": 1.0291666984558105, "reward_std": 0.1729067787528038, "rewards/accuracy_reward": 0.07500000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9541666805744171, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 572.289599609375, "epoch": 0.27108337333973437, "grad_norm": 0.2547048330307007, "kl": 1.0196032211184503, "learning_rate": 1.8271371070279418e-05, "loss": 0.0957, "reward": 1.0218750178813933, "reward_std": 0.14253914952278138, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9427083432674408, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 530.5541778564453, "epoch": 0.27140342454792765, "grad_norm": 0.40000978112220764, "kl": 1.0541925325989723, "learning_rate": 1.826508495378685e-05, "loss": 0.1986, "reward": 1.0296875178813933, "reward_std": 0.15220083631575107, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9567708492279052, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 552.2229370117187, "epoch": 0.271723475756121, "grad_norm": 0.3675740957260132, "kl": 1.61804456114769, "learning_rate": 1.825878851383305e-05, "loss": 0.16, "reward": 0.981250011920929, "reward_std": 0.1732964960858226, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9416666746139526, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 516.0250122070313, "epoch": 0.2720435269643143, "grad_norm": 0.3455909490585327, "kl": 1.393028575181961, "learning_rate": 1.8252481758282573e-05, "loss": 0.1151, "reward": 1.055729192495346, "reward_std": 0.2162807509303093, "rewards/accuracy_reward": 0.09791666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9578125059604645, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 530.6458465576172, "epoch": 0.2723635781725076, "grad_norm": 1.0094131231307983, "kl": 2.248434340953827, "learning_rate": 1.8246164695012817e-05, "loss": 0.2087, "reward": 1.0145833611488342, "reward_std": 0.14186026901006699, "rewards/accuracy_reward": 0.06458333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9500000178813934, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 544.0291778564454, "epoch": 0.2726836293807009, "grad_norm": 0.37523195147514343, "kl": 1.6864983469247818, "learning_rate": 1.8239837331914098e-05, "loss": 0.1855, "reward": 0.9838541805744171, "reward_std": 0.15197336673736572, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9442708492279053, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 511.55418395996094, "epoch": 0.27300368058889424, "grad_norm": 0.27413806319236755, "kl": 1.3582224547863007, "learning_rate": 1.8233499676889556e-05, "loss": 0.2058, "reward": 1.0500000178813935, "reward_std": 0.15795501098036765, "rewards/accuracy_reward": 0.10416667088866234, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9458333492279053, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 515.4479370117188, "epoch": 0.2733237317970875, "grad_norm": 0.6673869490623474, "kl": 0.7741886451840401, "learning_rate": 1.822715173785522e-05, "loss": 0.1256, "reward": 0.9984375238418579, "reward_std": 0.09940896760672331, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208492279053, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 513.2166778564454, "epoch": 0.27364378300528086, "grad_norm": 1.2225868701934814, "kl": 0.6319494009017944, "learning_rate": 1.8220793522739947e-05, "loss": 0.1281, "reward": 1.0994791865348816, "reward_std": 0.14179169721901416, "rewards/accuracy_reward": 0.1250000050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791805744171, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 493.2479278564453, "epoch": 0.27396383421347414, "grad_norm": 1.0505335330963135, "kl": 1.155528011918068, "learning_rate": 1.8214425039485428e-05, "loss": 0.1561, "reward": 1.0578125357627868, "reward_std": 0.14424145892262458, "rewards/accuracy_reward": 0.10000000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9578125238418579, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 517.2020965576172, "epoch": 0.2742838854216675, "grad_norm": 0.7932451963424683, "kl": 2.262352053821087, "learning_rate": 1.820804629604619e-05, "loss": 0.3155, "reward": 1.0395833432674408, "reward_std": 0.19677990078926086, "rewards/accuracy_reward": 0.09583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.943750011920929, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 470.45000915527345, "epoch": 0.27460393662986077, "grad_norm": 1.4789113998413086, "kl": 2.0796320915222166, "learning_rate": 1.8201657300389563e-05, "loss": 0.2949, "reward": 1.0286458551883697, "reward_std": 0.1159073494374752, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791805744171, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 502.1958526611328, "epoch": 0.2749239878380541, "grad_norm": 0.8765280246734619, "kl": 1.5847502857446671, "learning_rate": 1.8195258060495693e-05, "loss": 0.1356, "reward": 1.0989583551883697, "reward_std": 0.13560206349939108, "rewards/accuracy_reward": 0.13333333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9656250178813934, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 515.400015258789, "epoch": 0.2752440390462474, "grad_norm": 1.8679522275924683, "kl": 2.9036698162555696, "learning_rate": 1.8188848584357516e-05, "loss": 0.348, "reward": 0.9890625178813934, "reward_std": 0.21013089418411254, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9036458432674408, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 460.7666809082031, "epoch": 0.27556409025444073, "grad_norm": 0.4723247289657593, "kl": 1.273580791056156, "learning_rate": 1.8182428879980754e-05, "loss": 0.1668, "reward": 0.8343750178813935, "reward_std": 0.18344238325953482, "rewards/accuracy_reward": 0.03541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.7989583551883698, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 458.6187713623047, "epoch": 0.275884141462634, "grad_norm": 0.9013276100158691, "kl": 1.1724308669567107, "learning_rate": 1.8175998955383906e-05, "loss": 0.2216, "reward": 0.9062500298023224, "reward_std": 0.19240469932556153, "rewards/accuracy_reward": 0.10208333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8041666865348815, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 448.4750122070312, "epoch": 0.27620419267082735, "grad_norm": 1.1328942775726318, "kl": 0.9504182629287243, "learning_rate": 1.8169558818598236e-05, "loss": 0.1826, "reward": 0.9463541865348816, "reward_std": 0.16272302493453025, "rewards/accuracy_reward": 0.05208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8942708492279052, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 458.714599609375, "epoch": 0.27652424387902064, "grad_norm": 0.8893626928329468, "kl": 0.8462207525968551, "learning_rate": 1.8163108477667762e-05, "loss": 0.1331, "reward": 0.9947917044162751, "reward_std": 0.1171018997207284, "rewards/accuracy_reward": 0.02708333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083551883697, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 461.0895965576172, "epoch": 0.276844295087214, "grad_norm": 0.3679625988006592, "kl": 1.5704083681106566, "learning_rate": 1.815664794064925e-05, "loss": 0.1883, "reward": 0.9822916865348816, "reward_std": 0.1469844736158848, "rewards/accuracy_reward": 0.037500002048909666, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9447916746139526, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 460.74584045410154, "epoch": 0.27716434629540726, "grad_norm": 1.254338264465332, "kl": 2.278603066504002, "learning_rate": 1.8150177215612198e-05, "loss": 0.2866, "reward": 1.0031250238418579, "reward_std": 0.2205186128616333, "rewards/accuracy_reward": 0.06250000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9406250178813934, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 534.5208526611328, "epoch": 0.2774843975036006, "grad_norm": 0.5605323910713196, "kl": 2.108324646949768, "learning_rate": 1.8143696310638836e-05, "loss": 0.2894, "reward": 1.0187500298023224, "reward_std": 0.1810709685087204, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9395833492279053, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 467.27500610351564, "epoch": 0.2778044487117939, "grad_norm": 0.5356122255325317, "kl": 1.2343434900045396, "learning_rate": 1.81372052338241e-05, "loss": 0.238, "reward": 1.0869791865348817, "reward_std": 0.14499858394265175, "rewards/accuracy_reward": 0.1291666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.957812511920929, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 477.0354339599609, "epoch": 0.2781244999199872, "grad_norm": 0.437862753868103, "kl": 1.2617906153202056, "learning_rate": 1.813070399327564e-05, "loss": 0.1205, "reward": 1.0546875178813935, "reward_std": 0.18123132549226284, "rewards/accuracy_reward": 0.10000000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.954687523841858, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 546.1562652587891, "epoch": 0.2784445511281805, "grad_norm": 0.35148391127586365, "kl": 1.2551678597927094, "learning_rate": 1.8124192597113786e-05, "loss": 0.134, "reward": 1.0380208611488342, "reward_std": 0.12462070938199758, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041865348816, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 555.0312652587891, "epoch": 0.27876460233637385, "grad_norm": 0.639610230922699, "kl": 1.409567552804947, "learning_rate": 1.8117671053471576e-05, "loss": 0.1501, "reward": 1.0369791865348816, "reward_std": 0.1435700273141265, "rewards/accuracy_reward": 0.07708333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958492279053, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 548.8458587646485, "epoch": 0.27908465354456713, "grad_norm": 0.6962535977363586, "kl": 1.1621643796563148, "learning_rate": 1.8111139370494705e-05, "loss": 0.1272, "reward": 1.110416692495346, "reward_std": 0.12768222466111184, "rewards/accuracy_reward": 0.14166667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500059604645, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 576.8625244140625, "epoch": 0.27940470475276047, "grad_norm": 0.45951202511787415, "kl": 0.5613536521792412, "learning_rate": 1.8104597556341538e-05, "loss": 0.0462, "reward": 1.0333333551883697, "reward_std": 0.10816633310168981, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833373069763, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 577.0979248046875, "epoch": 0.27972475596095375, "grad_norm": 0.33993226289749146, "kl": 0.5836403653025627, "learning_rate": 1.8098045619183092e-05, "loss": 0.0209, "reward": 1.0614583432674407, "reward_std": 0.08450026344507933, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083432674408, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 608.3104370117187, "epoch": 0.28004480716914704, "grad_norm": 0.23612137138843536, "kl": 0.6390758916735649, "learning_rate": 1.809148356720303e-05, "loss": 0.0365, "reward": 1.024479204416275, "reward_std": 0.10049552712589502, "rewards/accuracy_reward": 0.047916668094694616, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625178813935, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 600.7541931152343, "epoch": 0.2803648583773404, "grad_norm": 1.0360605716705322, "kl": 0.8030475050210952, "learning_rate": 1.808491140859765e-05, "loss": 0.0634, "reward": 0.9723958373069763, "reward_std": 0.16076738238334656, "rewards/accuracy_reward": 0.012500000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958432674408, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 619.4771118164062, "epoch": 0.28068490958553366, "grad_norm": 0.13555213809013367, "kl": 0.5969193749129772, "learning_rate": 1.8078329151575874e-05, "loss": 0.0455, "reward": 1.0635416805744171, "reward_std": 0.10078618377447128, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250059604645, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 592.8291870117188, "epoch": 0.281004960793727, "grad_norm": 0.1967499703168869, "kl": 0.7825645431876183, "learning_rate": 1.8071736804359235e-05, "loss": 0.0243, "reward": 1.0677083492279054, "reward_std": 0.0947670703753829, "rewards/accuracy_reward": 0.08333333339542151, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 613.789599609375, "epoch": 0.2813250120019203, "grad_norm": 0.24350924789905548, "kl": 1.0826184466481208, "learning_rate": 1.806513437518187e-05, "loss": 0.0796, "reward": 1.032291704416275, "reward_std": 0.15164516121149063, "rewards/accuracy_reward": 0.06250000130385161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916805744171, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 588.2604431152344, "epoch": 0.2816450632101136, "grad_norm": 0.36908820271492004, "kl": 1.1280300706624984, "learning_rate": 1.8058521872290505e-05, "loss": 0.137, "reward": 0.973437511920929, "reward_std": 0.10502424836158752, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708432674408, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 618.9083618164062, "epoch": 0.2819651144183069, "grad_norm": 0.2288147509098053, "kl": 0.812470331788063, "learning_rate": 1.8051899303944454e-05, "loss": 0.0882, "reward": 1.0593750059604645, "reward_std": 0.10690983049571515, "rewards/accuracy_reward": 0.0812500011175871, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250059604645, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 609.6250213623047, "epoch": 0.28228516562650025, "grad_norm": 0.5861449837684631, "kl": 1.3498991549015045, "learning_rate": 1.8045266678415608e-05, "loss": 0.1378, "reward": 0.9854166805744171, "reward_std": 0.14203399419784546, "rewards/accuracy_reward": 0.01666666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500119209289, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 620.7187683105469, "epoch": 0.28260521683469353, "grad_norm": 1.0595533847808838, "kl": 0.9216204196214676, "learning_rate": 1.8038624003988406e-05, "loss": 0.1037, "reward": 1.014062523841858, "reward_std": 0.12942186892032623, "rewards/accuracy_reward": 0.03750000167638064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625178813935, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 583.2708526611328, "epoch": 0.28292526804288687, "grad_norm": 0.24561259150505066, "kl": 1.2314361870288848, "learning_rate": 1.8031971288959845e-05, "loss": 0.0694, "reward": 1.0031250238418579, "reward_std": 0.12703317496925592, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583492279053, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 601.0854370117188, "epoch": 0.28324531925108015, "grad_norm": 0.39103934168815613, "kl": 0.7185132935643196, "learning_rate": 1.8025308541639467e-05, "loss": 0.1049, "reward": 1.0197916865348815, "reward_std": 0.1216716593131423, "rewards/accuracy_reward": 0.050000001303851606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916805744171, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 548.1625183105468, "epoch": 0.2835653704592735, "grad_norm": 0.246460422873497, "kl": 0.48717030733823774, "learning_rate": 1.8018635770349343e-05, "loss": 0.0698, "reward": 0.9666666805744171, "reward_std": 0.11010051686316728, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833432674408, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 536.2646057128907, "epoch": 0.2838854216674668, "grad_norm": 0.28537264466285706, "kl": 0.6939228355884552, "learning_rate": 1.8011952983424058e-05, "loss": 0.0918, "reward": 1.0520833671092986, "reward_std": 0.17129152230918407, "rewards/accuracy_reward": 0.10208333674818278, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9500000178813934, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 506.7354278564453, "epoch": 0.2842054728756601, "grad_norm": 0.37171459197998047, "kl": 0.8684057459235192, "learning_rate": 1.800526018921072e-05, "loss": 0.0965, "reward": 0.9979166924953461, "reward_std": 0.1424863189458847, "rewards/accuracy_reward": 0.05000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9479166865348816, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 485.01251220703125, "epoch": 0.2845255240838534, "grad_norm": 1.3459445238113403, "kl": 0.7982180349528789, "learning_rate": 1.7998557396068923e-05, "loss": 0.0867, "reward": 1.024479192495346, "reward_std": 0.17068119421601297, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.9411458492279052, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 548.414599609375, "epoch": 0.28484557529204674, "grad_norm": 0.24494709074497223, "kl": 0.787135424464941, "learning_rate": 1.7991844612370756e-05, "loss": 0.081, "reward": 1.0135416984558105, "reward_std": 0.21748648285865785, "rewards/accuracy_reward": 0.05416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9593750059604644, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 660.3875244140625, "epoch": 0.28516562650024, "grad_norm": 0.18688689172267914, "kl": 0.7876987963914871, "learning_rate": 1.798512184650079e-05, "loss": 0.042, "reward": 1.0692708492279053, "reward_std": 0.1874027382582426, "rewards/accuracy_reward": 0.10000000279396773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708373069763, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 724.6104370117188, "epoch": 0.28548567770843336, "grad_norm": 0.09229505062103271, "kl": 0.4387684382498264, "learning_rate": 1.7978389106856056e-05, "loss": 0.0362, "reward": 1.0125000119209289, "reward_std": 0.1014697566628456, "rewards/accuracy_reward": 0.02708333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166686534882, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 739.0000183105469, "epoch": 0.28580572891662664, "grad_norm": 0.6484495401382446, "kl": 0.7545491896569729, "learning_rate": 1.797164640184605e-05, "loss": 0.0398, "reward": 1.051041692495346, "reward_std": 0.16586514431983232, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.971875011920929, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 768.3666870117188, "epoch": 0.28612578012482, "grad_norm": 0.2332949936389923, "kl": 1.24727663397789, "learning_rate": 1.796489373989271e-05, "loss": 0.0895, "reward": 1.034375011920929, "reward_std": 0.12449453994631768, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9635416746139527, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 763.9021057128906, "epoch": 0.28644583133301327, "grad_norm": 0.18527598679065704, "kl": 1.00824686139822, "learning_rate": 1.7958131129430417e-05, "loss": 0.0326, "reward": 1.0395833551883698, "reward_std": 0.10068847816437483, "rewards/accuracy_reward": 0.06875000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 734.9166809082031, "epoch": 0.2867658825412066, "grad_norm": 0.5873371362686157, "kl": 1.7709833174943923, "learning_rate": 1.7951358578905976e-05, "loss": 0.0726, "reward": 0.9812500178813934, "reward_std": 0.14730002786964178, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9458333551883698, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 734.1875183105469, "epoch": 0.2870859337493999, "grad_norm": 0.6751114130020142, "kl": 1.6086274296045304, "learning_rate": 1.7944576096778595e-05, "loss": 0.1066, "reward": 0.9916666924953461, "reward_std": 0.14545521959662439, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9458333551883698, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 679.9854339599609, "epoch": 0.28740598495759323, "grad_norm": 0.25408291816711426, "kl": 1.0246099442243577, "learning_rate": 1.793778369151991e-05, "loss": 0.092, "reward": 1.019791692495346, "reward_std": 0.15563630759716035, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9489583492279052, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 719.8354309082031, "epoch": 0.2877260361657865, "grad_norm": 0.5261191129684448, "kl": 0.9038506269454956, "learning_rate": 1.7930981371613936e-05, "loss": 0.0331, "reward": 0.9869791865348816, "reward_std": 0.13006459400057793, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9536458492279053, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 722.0062622070312, "epoch": 0.28804608737397985, "grad_norm": 0.4196346700191498, "kl": 0.968600545823574, "learning_rate": 1.792416914555707e-05, "loss": 0.0621, "reward": 0.9843750119209289, "reward_std": 0.15532765444368124, "rewards/accuracy_reward": 0.025000000558793544, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.959375011920929, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 656.9187774658203, "epoch": 0.28836613858217314, "grad_norm": 0.24890783429145813, "kl": 0.9109811738133431, "learning_rate": 1.7917347021858092e-05, "loss": 0.0637, "reward": 1.0385416984558105, "reward_std": 0.1349452082067728, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583611488342, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 647.0625122070312, "epoch": 0.2886861897903665, "grad_norm": 0.22287528216838837, "kl": 0.8536801934242249, "learning_rate": 1.791051500903814e-05, "loss": 0.0469, "reward": 0.9989583611488342, "reward_std": 0.16375069059431552, "rewards/accuracy_reward": 0.03333333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9656250059604645, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 688.527099609375, "epoch": 0.28900624099855976, "grad_norm": 0.8015788793563843, "kl": 1.4864769637584687, "learning_rate": 1.7903673115630703e-05, "loss": 0.0914, "reward": 1.0114583551883698, "reward_std": 0.16271494328975677, "rewards/accuracy_reward": 0.052083333395421504, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9593750238418579, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 681.2875183105468, "epoch": 0.2893262922067531, "grad_norm": 0.4300864040851593, "kl": 1.9600940197706223, "learning_rate": 1.7896821350181613e-05, "loss": 0.1166, "reward": 1.0031250298023224, "reward_std": 0.20148923099040986, "rewards/accuracy_reward": 0.06875000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9343750178813934, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 687.5271057128906, "epoch": 0.2896463434149464, "grad_norm": 0.2800224721431732, "kl": 1.190105938911438, "learning_rate": 1.788995972124903e-05, "loss": 0.0948, "reward": 0.9963541924953461, "reward_std": 0.1741415023803711, "rewards/accuracy_reward": 0.04791666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.948437511920929, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 615.8479309082031, "epoch": 0.2899663946231397, "grad_norm": 0.6161094903945923, "kl": 1.6506537348031998, "learning_rate": 1.788308823740344e-05, "loss": 0.1233, "reward": 1.0333333492279053, "reward_std": 0.2054402783513069, "rewards/accuracy_reward": 0.09791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9354166805744171, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 651.6416931152344, "epoch": 0.290286445831333, "grad_norm": 0.40559831261634827, "kl": 1.1269910991191865, "learning_rate": 1.7876206907227628e-05, "loss": 0.0767, "reward": 1.0171875059604645, "reward_std": 0.16564912348985672, "rewards/accuracy_reward": 0.06250000149011611, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9526041805744171, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 613.4104431152343, "epoch": 0.29060649703952635, "grad_norm": 0.8728421926498413, "kl": 1.824609386920929, "learning_rate": 1.7869315739316685e-05, "loss": 0.1396, "reward": 0.9671875178813935, "reward_std": 0.22036788761615753, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.917187511920929, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 660.864599609375, "epoch": 0.29092654824771963, "grad_norm": 0.3799927234649658, "kl": 1.3191021710634232, "learning_rate": 1.7862414742277993e-05, "loss": 0.0871, "reward": 1.0270833432674409, "reward_std": 0.1408295204862952, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333492279053, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 615.456265258789, "epoch": 0.29124659945591297, "grad_norm": 0.8161053657531738, "kl": 2.05251030921936, "learning_rate": 1.7855503924731205e-05, "loss": 0.2041, "reward": 1.0473958611488343, "reward_std": 0.18615373224020004, "rewards/accuracy_reward": 0.10208333861082793, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9453125178813935, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 582.612515258789, "epoch": 0.29156665066410625, "grad_norm": 0.3766956627368927, "kl": 1.0139563411474228, "learning_rate": 1.7848583295308236e-05, "loss": 0.1265, "reward": 0.9994791865348815, "reward_std": 0.11650126576423644, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791746139527, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 603.5229309082031, "epoch": 0.2918867018722996, "grad_norm": 0.4018208682537079, "kl": 0.8724268615245819, "learning_rate": 1.784165286265327e-05, "loss": 0.132, "reward": 1.0041666984558106, "reward_std": 0.14942692667245866, "rewards/accuracy_reward": 0.045833334885537626, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333611488343, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 604.341683959961, "epoch": 0.2922067530804929, "grad_norm": 0.30784985423088074, "kl": 0.8771272003650665, "learning_rate": 1.7834712635422718e-05, "loss": 0.1001, "reward": 1.0505208432674409, "reward_std": 0.18126559555530547, "rewards/accuracy_reward": 0.10000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9505208492279053, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 619.1208557128906, "epoch": 0.2925268042886862, "grad_norm": 0.278812974691391, "kl": 0.8446752950549126, "learning_rate": 1.7827762622285245e-05, "loss": 0.1318, "reward": 1.0598958611488343, "reward_std": 0.18923650197684766, "rewards/accuracy_reward": 0.10208333786576987, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9578125178813934, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 608.808349609375, "epoch": 0.2928468554968795, "grad_norm": 0.15253132581710815, "kl": 0.4771121509373188, "learning_rate": 1.7820802831921723e-05, "loss": 0.06, "reward": 1.066666704416275, "reward_std": 0.12053482681512832, "rewards/accuracy_reward": 0.09166667088866234, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000238418579, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 608.3854370117188, "epoch": 0.29316690670507284, "grad_norm": 0.21070396900177002, "kl": 0.3982532635331154, "learning_rate": 1.7813833273025237e-05, "loss": 0.1003, "reward": 1.0447916865348816, "reward_std": 0.15599412955343722, "rewards/accuracy_reward": 0.07291666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750059604645, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 620.0041870117187, "epoch": 0.2934869579132661, "grad_norm": 0.3192347586154938, "kl": 0.7548017039895057, "learning_rate": 1.780685395430109e-05, "loss": 0.0836, "reward": 1.0635416805744171, "reward_std": 0.13123438209295274, "rewards/accuracy_reward": 0.0895833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583373069763, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 582.8812744140625, "epoch": 0.2938070091214594, "grad_norm": 0.18562725186347961, "kl": 0.5546324595808982, "learning_rate": 1.779986488446676e-05, "loss": 0.0847, "reward": 1.0500000357627868, "reward_std": 0.1341039039194584, "rewards/accuracy_reward": 0.08750000204890966, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9604166805744171, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 631.4958557128906, "epoch": 0.29412706032965275, "grad_norm": 0.18860366940498352, "kl": 0.60446348041296, "learning_rate": 1.77928660722519e-05, "loss": 0.0673, "reward": 1.0619791805744172, "reward_std": 0.15306191500276328, "rewards/accuracy_reward": 0.0979166692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9640625178813934, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 589.8270965576172, "epoch": 0.29444711153784603, "grad_norm": 0.4077078700065613, "kl": 0.9243380039930343, "learning_rate": 1.7785857526398347e-05, "loss": 0.1521, "reward": 1.0421875298023224, "reward_std": 0.18533986136317254, "rewards/accuracy_reward": 0.0958333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9463541865348816, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 603.7916931152344, "epoch": 0.29476716274603937, "grad_norm": 0.17928466200828552, "kl": 0.7125317409634591, "learning_rate": 1.7778839255660087e-05, "loss": 0.1819, "reward": 1.0593750298023223, "reward_std": 0.16393165495246648, "rewards/accuracy_reward": 0.11458333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9447916865348815, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 560.7062622070313, "epoch": 0.29508721395423265, "grad_norm": 0.1269647628068924, "kl": 0.6188096687197685, "learning_rate": 1.7771811268803258e-05, "loss": 0.0549, "reward": 1.0052083551883697, "reward_std": 0.11031838692724705, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9656250238418579, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 594.4354400634766, "epoch": 0.295407265162426, "grad_norm": 0.4039466083049774, "kl": 1.4429447636008264, "learning_rate": 1.7764773574606124e-05, "loss": 0.1123, "reward": 1.0437500298023223, "reward_std": 0.1867046182975173, "rewards/accuracy_reward": 0.09166666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9520833551883697, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 569.133349609375, "epoch": 0.2957273163706193, "grad_norm": 0.20248201489448547, "kl": 0.6664691850543022, "learning_rate": 1.7757726181859084e-05, "loss": 0.1074, "reward": 1.0020833551883697, "reward_std": 0.10909523330628872, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833492279052, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 555.9187622070312, "epoch": 0.2960473675788126, "grad_norm": 0.3584718704223633, "kl": 0.9459757208824158, "learning_rate": 1.7750669099364643e-05, "loss": 0.116, "reward": 1.0421875298023224, "reward_std": 0.13381226696074008, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208492279053, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 574.2375122070313, "epoch": 0.2963674187870059, "grad_norm": 0.27927231788635254, "kl": 0.7706046402454376, "learning_rate": 1.774360233593742e-05, "loss": 0.1015, "reward": 1.0286458551883697, "reward_std": 0.13063797876238822, "rewards/accuracy_reward": 0.06250000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458492279053, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 629.7625183105469, "epoch": 0.29668746999519924, "grad_norm": 0.26649898290634155, "kl": 1.7680400401353835, "learning_rate": 1.7736525900404114e-05, "loss": 0.1569, "reward": 1.0302083671092988, "reward_std": 0.18417428024113178, "rewards/accuracy_reward": 0.08958333637565374, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9385416865348816, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 606.7000274658203, "epoch": 0.2970075212033925, "grad_norm": 0.4364243149757385, "kl": 1.2817148357629775, "learning_rate": 1.772943980160351e-05, "loss": 0.1445, "reward": 1.0682291984558105, "reward_std": 0.25006254613399503, "rewards/accuracy_reward": 0.12708333618938922, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9411458432674408, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 573.2708557128906, "epoch": 0.29732757241158586, "grad_norm": 0.43147382140159607, "kl": 0.8256754875183105, "learning_rate": 1.7722344048386468e-05, "loss": 0.1191, "reward": 1.0005208551883698, "reward_std": 0.15139769725501537, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9588541805744171, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 573.2187713623047, "epoch": 0.29764762361977914, "grad_norm": 1.0563299655914307, "kl": 1.2677388548851014, "learning_rate": 1.7715238649615893e-05, "loss": 0.1303, "reward": 1.0494791865348816, "reward_std": 0.1273749502375722, "rewards/accuracy_reward": 0.09166666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9578125238418579, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 513.6125122070313, "epoch": 0.2979676748279725, "grad_norm": 0.1512734740972519, "kl": 0.6284838706254959, "learning_rate": 1.770812361416675e-05, "loss": 0.1211, "reward": 1.060937523841858, "reward_std": 0.1170524686574936, "rewards/accuracy_reward": 0.08750000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375178813934, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 554.3916839599609, "epoch": 0.29828772603616577, "grad_norm": 0.1825270652770996, "kl": 0.3309263564646244, "learning_rate": 1.770099895092604e-05, "loss": 0.0311, "reward": 1.1010416924953461, "reward_std": 0.07066966965794563, "rewards/accuracy_reward": 0.11250000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9885416805744172, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 568.8979370117188, "epoch": 0.2986077772443591, "grad_norm": 0.3317464590072632, "kl": 0.2580555848777294, "learning_rate": 1.7693864668792785e-05, "loss": 0.0402, "reward": 1.1088541984558105, "reward_std": 0.09786607697606087, "rewards/accuracy_reward": 0.12291667088866234, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 569.3729309082031, "epoch": 0.2989278284525524, "grad_norm": 0.3599172830581665, "kl": 0.26390968188643454, "learning_rate": 1.768672077667802e-05, "loss": 0.0126, "reward": 1.0473958611488343, "reward_std": 0.1011963851749897, "rewards/accuracy_reward": 0.058333336003124715, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.989062511920929, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 540.2916870117188, "epoch": 0.29924787966074573, "grad_norm": 0.20416362583637238, "kl": 0.44174774885177615, "learning_rate": 1.767956728350479e-05, "loss": 0.0472, "reward": 1.0395833551883698, "reward_std": 0.07305270098149777, "rewards/accuracy_reward": 0.05625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333432674408, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 539.5771026611328, "epoch": 0.299567930868939, "grad_norm": 0.30982765555381775, "kl": 0.47428609281778333, "learning_rate": 1.7672404198208123e-05, "loss": 0.0426, "reward": 1.1197916984558105, "reward_std": 0.1203194510191679, "rewards/accuracy_reward": 0.1437500050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416865348815, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 534.5479339599609, "epoch": 0.29988798207713235, "grad_norm": 0.11249273270368576, "kl": 0.6478874146938324, "learning_rate": 1.7665231529735042e-05, "loss": 0.0666, "reward": 1.0703125238418578, "reward_std": 0.10500245019793511, "rewards/accuracy_reward": 0.09166666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 602.2646026611328, "epoch": 0.30020803328532564, "grad_norm": 1.0702687501907349, "kl": 0.7110832586884499, "learning_rate": 1.765804928704452e-05, "loss": 0.1257, "reward": 1.0927083611488342, "reward_std": 0.12243700325489044, "rewards/accuracy_reward": 0.11875000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 609.7104431152344, "epoch": 0.300528084493519, "grad_norm": 0.1390598863363266, "kl": 0.5671024739742279, "learning_rate": 1.7650857479107507e-05, "loss": 0.1166, "reward": 0.9692708551883698, "reward_std": 0.10575719326734542, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708551883698, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 601.1354309082031, "epoch": 0.30084813570171226, "grad_norm": 0.3185669779777527, "kl": 0.6791951522231102, "learning_rate": 1.7643656114906895e-05, "loss": 0.0587, "reward": 1.0505208492279052, "reward_std": 0.16533472537994384, "rewards/accuracy_reward": 0.08541666846722365, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041805744172, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 596.7916870117188, "epoch": 0.3011681869099056, "grad_norm": 0.40729889273643494, "kl": 1.093244832754135, "learning_rate": 1.7636445203437503e-05, "loss": 0.1655, "reward": 1.0260416865348816, "reward_std": 0.1972845211625099, "rewards/accuracy_reward": 0.08333333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9427083492279053, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 676.4041870117187, "epoch": 0.3014882381180989, "grad_norm": 0.0986812561750412, "kl": 0.3631768196821213, "learning_rate": 1.7629224753706088e-05, "loss": 0.0565, "reward": 1.027604192495346, "reward_std": 0.09776556696742773, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041865348816, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 645.5583618164062, "epoch": 0.3018082893262922, "grad_norm": 0.2628479599952698, "kl": 0.47837393432855607, "learning_rate": 1.762199477473131e-05, "loss": 0.087, "reward": 1.067187523841858, "reward_std": 0.11832643263041973, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041805744172, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 618.8875183105469, "epoch": 0.3021283405344855, "grad_norm": 0.175709068775177, "kl": 0.9550945192575455, "learning_rate": 1.7614755275543748e-05, "loss": 0.1663, "reward": 1.0333333671092988, "reward_std": 0.17842126339673997, "rewards/accuracy_reward": 0.08750000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9458333611488342, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 654.9562683105469, "epoch": 0.30244839174267885, "grad_norm": 0.34556370973587036, "kl": 0.4930610120296478, "learning_rate": 1.7607506265185846e-05, "loss": 0.0986, "reward": 0.9947916805744171, "reward_std": 0.1405083030462265, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9552083432674408, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 672.8104431152344, "epoch": 0.30276844295087213, "grad_norm": 0.26512089371681213, "kl": 0.7804930925369262, "learning_rate": 1.7600247752711952e-05, "loss": 0.0917, "reward": 1.0041666865348815, "reward_std": 0.17392733693122864, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9333333551883698, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 725.4125183105468, "epoch": 0.30308849415906547, "grad_norm": 0.1472983956336975, "kl": 0.7122527778148651, "learning_rate": 1.759297974718827e-05, "loss": 0.1153, "reward": 1.071875023841858, "reward_std": 0.19012432545423508, "rewards/accuracy_reward": 0.12500000204890965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9468750119209289, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 667.7604370117188, "epoch": 0.30340854536725875, "grad_norm": 0.2633792757987976, "kl": 0.8807353228330612, "learning_rate": 1.7585702257692863e-05, "loss": 0.1206, "reward": 1.0348958611488341, "reward_std": 0.20192356854677201, "rewards/accuracy_reward": 0.09583333656191825, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9390625119209289, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 692.2646057128907, "epoch": 0.3037285965754521, "grad_norm": 0.5958443880081177, "kl": 1.3553169280290605, "learning_rate": 1.7578415293315646e-05, "loss": 0.1455, "reward": 0.977604192495346, "reward_std": 0.2028628334403038, "rewards/accuracy_reward": 0.06041667014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9171875238418579, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 694.8229309082031, "epoch": 0.3040486477836454, "grad_norm": 0.32592839002609253, "kl": 1.444900530576706, "learning_rate": 1.7571118863158355e-05, "loss": 0.1701, "reward": 1.0734375357627868, "reward_std": 0.26215763986110685, "rewards/accuracy_reward": 0.1791666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8942708492279052, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 697.8479431152343, "epoch": 0.3043686989918387, "grad_norm": 0.2379060685634613, "kl": 1.5481299102306365, "learning_rate": 1.756381297633457e-05, "loss": 0.1559, "reward": 1.008854192495346, "reward_std": 0.24572829753160477, "rewards/accuracy_reward": 0.1354166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8734375178813935, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 731.483349609375, "epoch": 0.304688750200032, "grad_norm": 0.17579278349876404, "kl": 1.133284804224968, "learning_rate": 1.7556497641969658e-05, "loss": 0.0919, "reward": 1.068229192495346, "reward_std": 0.21461983472108842, "rewards/accuracy_reward": 0.15000000298023225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9182291865348816, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 723.1666870117188, "epoch": 0.30500880140822534, "grad_norm": 0.3515166938304901, "kl": 1.3535830855369568, "learning_rate": 1.754917286920081e-05, "loss": 0.112, "reward": 0.9218750119209289, "reward_std": 0.24458130151033403, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.8822916805744171, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 713.7125183105469, "epoch": 0.3053288526164186, "grad_norm": 0.20482727885246277, "kl": 1.2124733626842499, "learning_rate": 1.7541838667176993e-05, "loss": 0.0829, "reward": 0.9005208492279053, "reward_std": 0.23665229380130767, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8901041924953461, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 705.5625183105469, "epoch": 0.30564890382461196, "grad_norm": 0.29011431336402893, "kl": 1.3561316847801208, "learning_rate": 1.7534495045058947e-05, "loss": 0.106, "reward": 0.9645833551883698, "reward_std": 0.21052157506346703, "rewards/accuracy_reward": 0.0770833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8875000238418579, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 702.7250244140625, "epoch": 0.30596895503280525, "grad_norm": 0.3863130807876587, "kl": 1.455906194448471, "learning_rate": 1.7527142012019193e-05, "loss": 0.1285, "reward": 0.8713541865348816, "reward_std": 0.2652384236454964, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8692708551883698, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 808.6521118164062, "epoch": 0.3062890062409986, "grad_norm": 0.18241232633590698, "kl": 1.7860671520233153, "learning_rate": 1.7519779577241993e-05, "loss": 0.1083, "reward": 0.9317708492279053, "reward_std": 0.27449193596839905, "rewards/accuracy_reward": 0.06875000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8630208432674408, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 751.5437744140625, "epoch": 0.30660905744919187, "grad_norm": 0.32234227657318115, "kl": 1.7606966257095338, "learning_rate": 1.751240774992336e-05, "loss": 0.134, "reward": 0.9302083492279053, "reward_std": 0.27238035053014753, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8614583432674408, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 787.4166870117188, "epoch": 0.3069291086573852, "grad_norm": 0.6801765561103821, "kl": 2.5121779322624205, "learning_rate": 1.7505026539271038e-05, "loss": 0.1615, "reward": 0.9062500298023224, "reward_std": 0.3029158145189285, "rewards/accuracy_reward": 0.050000001303851606, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.8562500178813934, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 770.1125244140625, "epoch": 0.3072491598655785, "grad_norm": 0.19171766936779022, "kl": 1.4077009975910186, "learning_rate": 1.7497635954504487e-05, "loss": 0.0902, "reward": 0.9135416805744171, "reward_std": 0.2074896477162838, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9093750178813934, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 772.3312744140625, "epoch": 0.30756921107377183, "grad_norm": 0.27294042706489563, "kl": 0.896188372373581, "learning_rate": 1.749023600485488e-05, "loss": 0.0352, "reward": 1.0109375238418579, "reward_std": 0.19567719101905823, "rewards/accuracy_reward": 0.07500000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.935937511920929, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 711.4562683105469, "epoch": 0.3078892622819651, "grad_norm": 0.16912463307380676, "kl": 0.9790063366293907, "learning_rate": 1.7482826699565083e-05, "loss": 0.0763, "reward": 0.9838541805744171, "reward_std": 0.19945336878299713, "rewards/accuracy_reward": 0.05416666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9296875119209289, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 726.5479431152344, "epoch": 0.3082093134901584, "grad_norm": 0.1460140198469162, "kl": 0.7135851427912712, "learning_rate": 1.747540804788965e-05, "loss": 0.0191, "reward": 0.962500023841858, "reward_std": 0.13189447149634362, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333492279053, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 743.6562683105469, "epoch": 0.30852936469835174, "grad_norm": 0.13990092277526855, "kl": 0.574808469414711, "learning_rate": 1.7467980059094817e-05, "loss": 0.0155, "reward": 0.9984375178813935, "reward_std": 0.11035698503255845, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9630208432674408, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 712.4729370117187, "epoch": 0.308849415906545, "grad_norm": 0.2480141520500183, "kl": 0.6258080065250397, "learning_rate": 1.7460542742458464e-05, "loss": 0.056, "reward": 1.0510416984558106, "reward_std": 0.1838358849287033, "rewards/accuracy_reward": 0.08750000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9635416746139527, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 716.4937744140625, "epoch": 0.30916946711473836, "grad_norm": 0.2927420139312744, "kl": 0.6674822881817818, "learning_rate": 1.745309610727014e-05, "loss": 0.0581, "reward": 1.0708333492279052, "reward_std": 0.09178536143153906, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 728.0312744140625, "epoch": 0.30948951832293164, "grad_norm": 0.0917883962392807, "kl": 0.24520479291677474, "learning_rate": 1.744564016283102e-05, "loss": 0.0244, "reward": 1.0692708551883698, "reward_std": 0.08112927377223969, "rewards/accuracy_reward": 0.08333333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375059604645, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 743.9187622070312, "epoch": 0.309809569531125, "grad_norm": 0.0622573047876358, "kl": 0.3754092678427696, "learning_rate": 1.7438174918453916e-05, "loss": 0.0386, "reward": 1.0364583611488343, "reward_std": 0.13707383908331394, "rewards/accuracy_reward": 0.06250000130385161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 680.3625244140625, "epoch": 0.31012962073931827, "grad_norm": 0.1604725569486618, "kl": 0.5867097809910774, "learning_rate": 1.7430700383463253e-05, "loss": 0.0852, "reward": 1.058854192495346, "reward_std": 0.20737518668174743, "rewards/accuracy_reward": 0.10833333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9505208492279053, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 727.0729370117188, "epoch": 0.3104496719475116, "grad_norm": 0.10366953909397125, "kl": 0.4695418193936348, "learning_rate": 1.742321656719506e-05, "loss": 0.0847, "reward": 1.009375023841858, "reward_std": 0.12343942523002624, "rewards/accuracy_reward": 0.04375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965625011920929, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 663.1896118164062, "epoch": 0.3107697231557049, "grad_norm": 0.37568023800849915, "kl": 0.9744888663291931, "learning_rate": 1.7415723478996955e-05, "loss": 0.1292, "reward": 1.0635416865348817, "reward_std": 0.1882859192788601, "rewards/accuracy_reward": 0.1145833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9489583432674408, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 687.7958435058594, "epoch": 0.31108977436389823, "grad_norm": 0.05901394784450531, "kl": 0.26913181617856025, "learning_rate": 1.7408221128228145e-05, "loss": 0.0557, "reward": 1.0322916865348817, "reward_std": 0.10280282869935035, "rewards/accuracy_reward": 0.052083333395421504, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083432674408, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 661.0646057128906, "epoch": 0.3114098255720915, "grad_norm": 0.12370312213897705, "kl": 0.5913057863712311, "learning_rate": 1.74007095242594e-05, "loss": 0.1314, "reward": 1.0161458492279052, "reward_std": 0.1552325375378132, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9494791805744172, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 639.9771057128906, "epoch": 0.31172987678028485, "grad_norm": 0.313650906085968, "kl": 0.38522453233599663, "learning_rate": 1.7393188676473053e-05, "loss": 0.0671, "reward": 1.0052083551883697, "reward_std": 0.0870123527944088, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750178813934, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 698.7041748046875, "epoch": 0.31204992798847814, "grad_norm": 0.10242673009634018, "kl": 0.7089316248893738, "learning_rate": 1.738565859426297e-05, "loss": 0.0851, "reward": 0.9661458551883697, "reward_std": 0.1504125714302063, "rewards/accuracy_reward": 0.010416667163372039, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291805744171, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 662.2646118164063, "epoch": 0.3123699791966715, "grad_norm": 0.1471061110496521, "kl": 0.5538154274225235, "learning_rate": 1.737811928703457e-05, "loss": 0.0833, "reward": 1.0546875178813935, "reward_std": 0.1336808368563652, "rewards/accuracy_reward": 0.09166666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208492279053, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 687.2437683105469, "epoch": 0.31269003040486476, "grad_norm": 0.10087238997220993, "kl": 0.48820848688483237, "learning_rate": 1.7370570764204788e-05, "loss": 0.0999, "reward": 1.0343750298023224, "reward_std": 0.15432624202221631, "rewards/accuracy_reward": 0.0666666692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083492279053, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 639.7041870117188, "epoch": 0.3130100816130581, "grad_norm": 0.22208651900291443, "kl": 0.6460809573531151, "learning_rate": 1.7363013035202058e-05, "loss": 0.0418, "reward": 1.0875000178813934, "reward_std": 0.1667162150144577, "rewards/accuracy_reward": 0.12083333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666805744171, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 620.1083557128907, "epoch": 0.3133301328212514, "grad_norm": 0.139918714761734, "kl": 0.7306598663330078, "learning_rate": 1.7355446109466326e-05, "loss": 0.1177, "reward": 1.0744791984558106, "reward_std": 0.2074648855254054, "rewards/accuracy_reward": 0.12916667219251393, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9453125119209289, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 622.9604309082031, "epoch": 0.3136501840294447, "grad_norm": 0.18447650969028473, "kl": 0.7331676751375198, "learning_rate": 1.734786999644902e-05, "loss": 0.1319, "reward": 1.017187523841858, "reward_std": 0.15683282688260078, "rewards/accuracy_reward": 0.05833333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9588541805744171, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 581.8250122070312, "epoch": 0.313970235237638, "grad_norm": 0.1246795505285263, "kl": 0.5896144509315491, "learning_rate": 1.7340284705613045e-05, "loss": 0.0758, "reward": 1.0437500178813934, "reward_std": 0.10564655810594559, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166805744172, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 639.4041809082031, "epoch": 0.31429028644583135, "grad_norm": 0.27654311060905457, "kl": 0.4418774448335171, "learning_rate": 1.7332690246432774e-05, "loss": 0.0852, "reward": 1.084895873069763, "reward_std": 0.14493267983198166, "rewards/accuracy_reward": 0.11458333805203438, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.970312523841858, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 620.7979370117188, "epoch": 0.31461033765402463, "grad_norm": 0.18854264914989471, "kl": 1.0708917260169983, "learning_rate": 1.7325086628394017e-05, "loss": 0.1695, "reward": 1.1531250417232513, "reward_std": 0.19459521472454072, "rewards/accuracy_reward": 0.20833334028720857, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9447916746139526, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 643.2104370117188, "epoch": 0.31493038886221797, "grad_norm": 0.4541122615337372, "kl": 1.2559853374958039, "learning_rate": 1.731747386099404e-05, "loss": 0.1223, "reward": 0.947916692495346, "reward_std": 0.21363041847944259, "rewards/accuracy_reward": 0.025000001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9229166984558106, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 641.2625152587891, "epoch": 0.31525044007041125, "grad_norm": 0.5043960809707642, "kl": 1.4750530004501343, "learning_rate": 1.7309851953741532e-05, "loss": 0.1493, "reward": 0.9682291924953461, "reward_std": 0.20985282510519027, "rewards/accuracy_reward": 0.03750000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9307291865348816, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 613.8229370117188, "epoch": 0.3155704912786046, "grad_norm": 0.5190443992614746, "kl": 1.9234457969665528, "learning_rate": 1.7302220916156592e-05, "loss": 0.2225, "reward": 0.9427083551883697, "reward_std": 0.2576408013701439, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9010416865348816, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 639.17294921875, "epoch": 0.3158905424867979, "grad_norm": 0.2320450246334076, "kl": 1.2377650499343873, "learning_rate": 1.7294580757770725e-05, "loss": 0.141, "reward": 0.9635416865348816, "reward_std": 0.21727037131786348, "rewards/accuracy_reward": 0.0479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9156250119209289, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 629.5395935058593, "epoch": 0.3162105936949912, "grad_norm": 0.1871139407157898, "kl": 1.1846880629658698, "learning_rate": 1.728693148812684e-05, "loss": 0.1154, "reward": 0.9864583730697631, "reward_std": 0.1802637368440628, "rewards/accuracy_reward": 0.052083336375653745, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9343750238418579, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 597.0541809082031, "epoch": 0.3165306449031845, "grad_norm": 0.4449709951877594, "kl": 0.8319399744272232, "learning_rate": 1.727927311677921e-05, "loss": 0.1479, "reward": 1.0192708551883698, "reward_std": 0.1877690449357033, "rewards/accuracy_reward": 0.07083333432674407, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9484375178813934, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 621.6937683105468, "epoch": 0.31685069611137784, "grad_norm": 0.3338225483894348, "kl": 1.130355241894722, "learning_rate": 1.7271605653293486e-05, "loss": 0.122, "reward": 0.9359375178813935, "reward_std": 0.2035887584090233, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9317708551883698, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 617.0375244140625, "epoch": 0.3171707473195711, "grad_norm": 0.15540385246276855, "kl": 0.9231228500604629, "learning_rate": 1.7263929107246672e-05, "loss": 0.1501, "reward": 1.0187500238418579, "reward_std": 0.15587160028517247, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9479166865348816, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 627.5458557128907, "epoch": 0.31749079852776446, "grad_norm": 0.11945409327745438, "kl": 0.8397936165332794, "learning_rate": 1.725624348822712e-05, "loss": 0.1723, "reward": 1.0463541924953461, "reward_std": 0.15689616054296493, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9463541865348816, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 587.2479370117187, "epoch": 0.31781084973595775, "grad_norm": 0.474109411239624, "kl": 1.2607935786247253, "learning_rate": 1.7248548805834512e-05, "loss": 0.2222, "reward": 0.9911458611488342, "reward_std": 0.2234620615839958, "rewards/accuracy_reward": 0.06666666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9244791865348816, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 594.1166900634765, "epoch": 0.3181309009441511, "grad_norm": 0.33378320932388306, "kl": 1.0174303948879242, "learning_rate": 1.724084506967985e-05, "loss": 0.1641, "reward": 1.0338541984558105, "reward_std": 0.2083968624472618, "rewards/accuracy_reward": 0.0958333358168602, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.935937511920929, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 542.108349609375, "epoch": 0.31845095215234437, "grad_norm": 0.20785702764987946, "kl": 0.7034070655703545, "learning_rate": 1.723313228938545e-05, "loss": 0.1507, "reward": 0.9697916805744171, "reward_std": 0.19483174681663512, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9322916746139527, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 548.8812622070312, "epoch": 0.3187710033605377, "grad_norm": 0.26794859766960144, "kl": 0.9634685277938843, "learning_rate": 1.7225410474584907e-05, "loss": 0.1563, "reward": 1.052604192495346, "reward_std": 0.19138510003685952, "rewards/accuracy_reward": 0.11875000409781933, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9338541805744172, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 604.5541839599609, "epoch": 0.319091054568731, "grad_norm": 0.25083127617836, "kl": 1.1079894408583641, "learning_rate": 1.721767963492313e-05, "loss": 0.2077, "reward": 0.9994791984558106, "reward_std": 0.22150392681360245, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9161458551883698, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 542.695849609375, "epoch": 0.31941110577692433, "grad_norm": 0.21013391017913818, "kl": 0.9574685275554657, "learning_rate": 1.7209939780056273e-05, "loss": 0.1939, "reward": 1.0083333551883698, "reward_std": 0.16795330494642258, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9375000178813935, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 551.3500213623047, "epoch": 0.3197311569851176, "grad_norm": 0.11133457720279694, "kl": 0.6998517155647278, "learning_rate": 1.7202190919651764e-05, "loss": 0.1522, "reward": 1.0833333611488343, "reward_std": 0.14614312946796418, "rewards/accuracy_reward": 0.13541667070239782, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9479166865348816, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 568.8375183105469, "epoch": 0.32005120819331095, "grad_norm": 0.16730454564094543, "kl": 1.1215898275375367, "learning_rate": 1.7194433063388273e-05, "loss": 0.2736, "reward": 1.0567708611488342, "reward_std": 0.23038013577461242, "rewards/accuracy_reward": 0.13125000447034835, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9255208492279052, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 531.7354278564453, "epoch": 0.32037125940150424, "grad_norm": 0.2715721130371094, "kl": 1.28203387260437, "learning_rate": 1.718666622095572e-05, "loss": 0.1875, "reward": 1.0302083551883698, "reward_std": 0.17592437490820884, "rewards/accuracy_reward": 0.09166666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9385416865348816, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 545.2479370117187, "epoch": 0.3206913106096976, "grad_norm": 0.28725120425224304, "kl": 0.7281457930803299, "learning_rate": 1.7178890402055232e-05, "loss": 0.1394, "reward": 0.9776041865348816, "reward_std": 0.1258978858590126, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9588541805744171, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 520.7625122070312, "epoch": 0.32101136181789086, "grad_norm": 0.12669086456298828, "kl": 0.750990717113018, "learning_rate": 1.7171105616399153e-05, "loss": 0.1796, "reward": 1.0588541865348815, "reward_std": 0.1406536651775241, "rewards/accuracy_reward": 0.10625000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9526041805744171, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 546.8916839599609, "epoch": 0.3213314130260842, "grad_norm": 0.20147007703781128, "kl": 0.889993640780449, "learning_rate": 1.7163311873711035e-05, "loss": 0.1543, "reward": 1.0171875178813934, "reward_std": 0.15972171053290368, "rewards/accuracy_reward": 0.060416667722165585, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9567708492279052, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 503.09376220703126, "epoch": 0.3216514642342775, "grad_norm": 0.25828316807746887, "kl": 0.7611904472112656, "learning_rate": 1.7155509183725607e-05, "loss": 0.177, "reward": 1.0395833611488343, "reward_std": 0.18584888130426408, "rewards/accuracy_reward": 0.08333333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.956250011920929, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 492.1937622070312, "epoch": 0.32197151544247077, "grad_norm": 0.11577334254980087, "kl": 0.6240782648324966, "learning_rate": 1.714769755618878e-05, "loss": 0.1137, "reward": 1.0635416805744171, "reward_std": 0.1440664477646351, "rewards/accuracy_reward": 0.09375000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916746139527, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 465.85625915527345, "epoch": 0.3222915666506641, "grad_norm": 0.2497272938489914, "kl": 0.3379806771874428, "learning_rate": 1.7139877000857623e-05, "loss": 0.0777, "reward": 1.0541666865348815, "reward_std": 0.0930209718644619, "rewards/accuracy_reward": 0.07500000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666805744171, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 455.9854248046875, "epoch": 0.3226116178588574, "grad_norm": 0.35958606004714966, "kl": 0.5691968247294426, "learning_rate": 1.7132047527500366e-05, "loss": 0.1087, "reward": 1.0572916984558105, "reward_std": 0.1227844811975956, "rewards/accuracy_reward": 0.08333333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583432674408, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 481.6020935058594, "epoch": 0.32293166906705073, "grad_norm": 0.1607331484556198, "kl": 0.38854978755116465, "learning_rate": 1.712420914589637e-05, "loss": 0.1146, "reward": 1.0718750178813934, "reward_std": 0.10704346112906933, "rewards/accuracy_reward": 0.09791666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583432674408, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 452.42918395996094, "epoch": 0.323251720275244, "grad_norm": 0.21614070236682892, "kl": 0.5235861442983151, "learning_rate": 1.711636186583612e-05, "loss": 0.1206, "reward": 1.1130208551883698, "reward_std": 0.09185979887843132, "rewards/accuracy_reward": 0.1312500059604645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708432674408, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 471.1458465576172, "epoch": 0.32357177148343735, "grad_norm": 0.21211141347885132, "kl": 0.5399421505630017, "learning_rate": 1.710850569712123e-05, "loss": 0.0784, "reward": 1.1104166984558106, "reward_std": 0.11394599229097366, "rewards/accuracy_reward": 0.1270833384245634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333432674408, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 456.9541809082031, "epoch": 0.32389182269163064, "grad_norm": 0.15681226551532745, "kl": 0.43956211805343626, "learning_rate": 1.7100640649564396e-05, "loss": 0.0954, "reward": 1.042187511920929, "reward_std": 0.07975867688655854, "rewards/accuracy_reward": 0.05416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9880208432674408, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 470.1875061035156, "epoch": 0.324211873899824, "grad_norm": 0.468483567237854, "kl": 0.8010672204196453, "learning_rate": 1.7092766732989418e-05, "loss": 0.1121, "reward": 1.0697916865348815, "reward_std": 0.12852012366056442, "rewards/accuracy_reward": 0.08958333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083373069763, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 453.1958465576172, "epoch": 0.32453192510801726, "grad_norm": 0.21684125065803528, "kl": 0.39564828500151633, "learning_rate": 1.708488395723117e-05, "loss": 0.0683, "reward": 1.0682291865348816, "reward_std": 0.09386988766491414, "rewards/accuracy_reward": 0.08125000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9869791746139527, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 459.4958435058594, "epoch": 0.3248519763162106, "grad_norm": 0.23216085135936737, "kl": 0.6974504925310612, "learning_rate": 1.7076992332135595e-05, "loss": 0.1328, "reward": 1.0578125178813935, "reward_std": 0.11282338351011276, "rewards/accuracy_reward": 0.07708333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291805744172, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 478.4666748046875, "epoch": 0.3251720275244039, "grad_norm": 0.16336947679519653, "kl": 0.4402541309595108, "learning_rate": 1.7069091867559687e-05, "loss": 0.0909, "reward": 1.0713541865348817, "reward_std": 0.09093062989413739, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 476.71250915527344, "epoch": 0.3254920787325972, "grad_norm": 0.24683783948421478, "kl": 0.5640803650021553, "learning_rate": 1.706118257337148e-05, "loss": 0.0919, "reward": 1.0421875238418579, "reward_std": 0.1091598778963089, "rewards/accuracy_reward": 0.06458333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 482.1416839599609, "epoch": 0.3258121299407905, "grad_norm": 0.18254989385604858, "kl": 0.44138511940836905, "learning_rate": 1.7053264459450023e-05, "loss": 0.1237, "reward": 1.018229180574417, "reward_std": 0.08106882330030203, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291805744172, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 421.80626220703124, "epoch": 0.32613218114898385, "grad_norm": 0.1619545817375183, "kl": 0.5294121131300926, "learning_rate": 1.7045337535685414e-05, "loss": 0.0465, "reward": 1.0416666984558105, "reward_std": 0.144361755810678, "rewards/accuracy_reward": 0.08333333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333432674408, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 478.7354309082031, "epoch": 0.32645223235717713, "grad_norm": 0.18814796209335327, "kl": 0.449739009141922, "learning_rate": 1.7037401811978726e-05, "loss": 0.0714, "reward": 1.0338541984558105, "reward_std": 0.1286115448921919, "rewards/accuracy_reward": 0.06250000353902578, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541865348816, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 460.3479278564453, "epoch": 0.32677228356537047, "grad_norm": 0.45165371894836426, "kl": 0.5484678715467453, "learning_rate": 1.7029457298242035e-05, "loss": 0.1563, "reward": 1.1328125298023224, "reward_std": 0.14226205535233022, "rewards/accuracy_reward": 0.17291667312383652, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958551883697, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 457.78126220703126, "epoch": 0.32709233477356375, "grad_norm": 0.34666770696640015, "kl": 0.6640940323472023, "learning_rate": 1.7021504004398392e-05, "loss": 0.1551, "reward": 1.0348958611488341, "reward_std": 0.11599632911384106, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9640625238418579, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 464.120849609375, "epoch": 0.3274123859817571, "grad_norm": 0.261369913816452, "kl": 1.0041472047567368, "learning_rate": 1.7013541940381824e-05, "loss": 0.2455, "reward": 1.0723958730697631, "reward_std": 0.22422086521983148, "rewards/accuracy_reward": 0.12916666977107524, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9432291805744171, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 450.2083435058594, "epoch": 0.3277324371899504, "grad_norm": 0.15666531026363373, "kl": 0.6061700366437435, "learning_rate": 1.70055711161373e-05, "loss": 0.1381, "reward": 1.0192708671092987, "reward_std": 0.12545478213578462, "rewards/accuracy_reward": 0.05208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875178813935, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 473.61667785644534, "epoch": 0.3280524883981437, "grad_norm": 0.3670523464679718, "kl": 0.8996363550424575, "learning_rate": 1.6997591541620734e-05, "loss": 0.1796, "reward": 1.0375000298023225, "reward_std": 0.17077935561537744, "rewards/accuracy_reward": 0.07708333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9604166746139526, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 495.0062622070312, "epoch": 0.328372539606337, "grad_norm": 0.16689921915531158, "kl": 0.8198397219181061, "learning_rate": 1.6989603226798976e-05, "loss": 0.2232, "reward": 1.0385416865348815, "reward_std": 0.17929813712835313, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916746139526, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 492.21250915527344, "epoch": 0.32869259081453034, "grad_norm": 0.38878318667411804, "kl": 1.0202806413173675, "learning_rate": 1.698160618164979e-05, "loss": 0.2086, "reward": 1.0447916984558105, "reward_std": 0.16940562725067138, "rewards/accuracy_reward": 0.09791667014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9468750119209289, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 459.2270965576172, "epoch": 0.3290126420227236, "grad_norm": 0.21051208674907684, "kl": 0.7796215415000916, "learning_rate": 1.6973600416161842e-05, "loss": 0.1823, "reward": 1.0338541865348816, "reward_std": 0.15895916149020195, "rewards/accuracy_reward": 0.07291666828095913, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9588541805744171, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 479.7583465576172, "epoch": 0.32933269323091696, "grad_norm": 0.3499182462692261, "kl": 0.8251389652490616, "learning_rate": 1.6965585940334688e-05, "loss": 0.2098, "reward": 1.0192708611488341, "reward_std": 0.14868669509887694, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9526041865348815, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 489.2979309082031, "epoch": 0.32965274443911025, "grad_norm": 0.32130110263824463, "kl": 0.570948114991188, "learning_rate": 1.6957562764178774e-05, "loss": 0.1494, "reward": 1.009375011920929, "reward_std": 0.10973553471267224, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916805744171, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 480.38543701171875, "epoch": 0.3299727956473036, "grad_norm": 0.43677008152008057, "kl": 0.7646385207772255, "learning_rate": 1.69495308977154e-05, "loss": 0.2192, "reward": 1.1062500298023223, "reward_std": 0.15374659057706594, "rewards/accuracy_reward": 0.1479166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333492279053, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 482.4104339599609, "epoch": 0.33029284685549687, "grad_norm": 0.2866804003715515, "kl": 0.9388459503650666, "learning_rate": 1.694149035097673e-05, "loss": 0.2006, "reward": 1.002604180574417, "reward_std": 0.16000167801976203, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.9526041805744171, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 484.7291778564453, "epoch": 0.3306128980636902, "grad_norm": 0.2829350531101227, "kl": 1.3405283033847808, "learning_rate": 1.6933441134005774e-05, "loss": 0.3405, "reward": 1.133854216337204, "reward_std": 0.1917470723390579, "rewards/accuracy_reward": 0.19583334028720856, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9380208551883698, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 484.714599609375, "epoch": 0.3309329492718835, "grad_norm": 0.4547743499279022, "kl": 1.0454600259661675, "learning_rate": 1.692538325685635e-05, "loss": 0.274, "reward": 1.113541692495346, "reward_std": 0.18550372272729873, "rewards/accuracy_reward": 0.177083339355886, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9343750238418579, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 476.5916778564453, "epoch": 0.33125300048007683, "grad_norm": 0.17376388609409332, "kl": 0.7498746126890182, "learning_rate": 1.6917316729593115e-05, "loss": 0.1823, "reward": 1.0651041924953462, "reward_std": 0.1255657471716404, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375059604645, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 468.60418395996095, "epoch": 0.3315730516882701, "grad_norm": 0.2241523712873459, "kl": 0.6255024075508118, "learning_rate": 1.6909241562291522e-05, "loss": 0.2153, "reward": 1.0432291865348815, "reward_std": 0.1870565339922905, "rewards/accuracy_reward": 0.08750000279396772, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291805744171, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 482.62500915527346, "epoch": 0.33189310289646345, "grad_norm": 0.3327322006225586, "kl": 0.9747333094477654, "learning_rate": 1.690115776503782e-05, "loss": 0.1304, "reward": 1.1036458492279053, "reward_std": 0.15871551111340523, "rewards/accuracy_reward": 0.14791667107492684, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291805744171, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 485.9416839599609, "epoch": 0.33221315410465674, "grad_norm": 0.29658105969429016, "kl": 1.1347735792398452, "learning_rate": 1.689306534792903e-05, "loss": 0.1945, "reward": 1.101562535762787, "reward_std": 0.19802000969648362, "rewards/accuracy_reward": 0.14583333842456342, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291865348816, "step": 1038 }, { "clip_ratio": 0.0, "completion_length": 488.0104248046875, "epoch": 0.3325332053128501, "grad_norm": 0.30956873297691345, "kl": 0.9602329656481743, "learning_rate": 1.6884964321072938e-05, "loss": 0.1944, "reward": 1.004687523841858, "reward_std": 0.17770743370056152, "rewards/accuracy_reward": 0.05208333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9526041805744171, "step": 1039 }, { "clip_ratio": 0.0, "completion_length": 461.8354248046875, "epoch": 0.33285325652104336, "grad_norm": 0.2959546446800232, "kl": 0.6376917466521264, "learning_rate": 1.68768546945881e-05, "loss": 0.1583, "reward": 1.0161458551883698, "reward_std": 0.12123525217175483, "rewards/accuracy_reward": 0.045833334885537626, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125178813934, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 472.9666809082031, "epoch": 0.3331733077292367, "grad_norm": 0.13327783346176147, "kl": 0.6734405755996704, "learning_rate": 1.68687364786038e-05, "loss": 0.1282, "reward": 1.1036458551883697, "reward_std": 0.168779456615448, "rewards/accuracy_reward": 0.14166667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791746139527, "step": 1041 }, { "clip_ratio": 0.0, "completion_length": 492.8271026611328, "epoch": 0.33349335893743, "grad_norm": 0.1860327571630478, "kl": 0.4504072442650795, "learning_rate": 1.686060968326005e-05, "loss": 0.1078, "reward": 1.066666692495346, "reward_std": 0.10958906393498183, "rewards/accuracy_reward": 0.08958333693444728, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833492279053, "step": 1042 }, { "clip_ratio": 0.0, "completion_length": 473.4041778564453, "epoch": 0.3338134101456233, "grad_norm": 0.24637283384799957, "kl": 1.1397089630365371, "learning_rate": 1.685247431870758e-05, "loss": 0.1869, "reward": 0.9864583492279053, "reward_std": 0.13785818926990032, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9531250119209289, "step": 1043 }, { "clip_ratio": 0.0, "completion_length": 473.10834350585935, "epoch": 0.3341334613538166, "grad_norm": 0.14904294908046722, "kl": 0.3731867730617523, "learning_rate": 1.6844330395107825e-05, "loss": 0.112, "reward": 1.0729166984558105, "reward_std": 0.13176908865571021, "rewards/accuracy_reward": 0.09375000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666865348816, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 472.1208465576172, "epoch": 0.33445351256200995, "grad_norm": 0.06686241179704666, "kl": 0.30279314517974854, "learning_rate": 1.6836177922632918e-05, "loss": 0.0505, "reward": 1.0250000178813934, "reward_std": 0.054784043319523336, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9875000059604645, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 493.8479278564453, "epoch": 0.33477356377020323, "grad_norm": 0.09057088941335678, "kl": 0.549952282756567, "learning_rate": 1.6828016911465655e-05, "loss": 0.0907, "reward": 1.0505208492279052, "reward_std": 0.08606277983635664, "rewards/accuracy_reward": 0.07083333432674407, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979687511920929, "step": 1046 }, { "clip_ratio": 0.0, "completion_length": 467.7375091552734, "epoch": 0.33509361497839657, "grad_norm": 0.34637758135795593, "kl": 0.7677448585629463, "learning_rate": 1.6819847371799505e-05, "loss": 0.1089, "reward": 0.9848958671092987, "reward_std": 0.11421102657914162, "rewards/accuracy_reward": 0.01250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958551883698, "step": 1047 }, { "clip_ratio": 0.0, "completion_length": 469.1625091552734, "epoch": 0.33541366618658985, "grad_norm": 0.15661399066448212, "kl": 0.47885870188474655, "learning_rate": 1.681166931383859e-05, "loss": 0.0583, "reward": 1.056250023841858, "reward_std": 0.0759154099971056, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166746139527, "step": 1048 }, { "clip_ratio": 0.0, "completion_length": 462.1291778564453, "epoch": 0.33573371739478314, "grad_norm": 0.16242820024490356, "kl": 0.251812618970871, "learning_rate": 1.6803482747797674e-05, "loss": 0.0133, "reward": 1.0750000119209289, "reward_std": 0.08946933038532734, "rewards/accuracy_reward": 0.08750000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9875000178813934, "step": 1049 }, { "clip_ratio": 0.0, "completion_length": 471.3854248046875, "epoch": 0.3360537686029765, "grad_norm": 0.23674461245536804, "kl": 0.4152077123522758, "learning_rate": 1.6795287683902136e-05, "loss": 0.0852, "reward": 1.0583333492279052, "reward_std": 0.11216433495283126, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333432674408, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 486.94793395996095, "epoch": 0.33637381981116976, "grad_norm": 0.15628622472286224, "kl": 0.33315576761960985, "learning_rate": 1.6787084132387987e-05, "loss": 0.0482, "reward": 1.0989583432674408, "reward_std": 0.12317422851920128, "rewards/accuracy_reward": 0.11458333786576987, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1051 }, { "clip_ratio": 0.0, "completion_length": 485.56251525878906, "epoch": 0.3366938710193631, "grad_norm": 0.21389330923557281, "kl": 0.5460788942873478, "learning_rate": 1.6778872103501825e-05, "loss": 0.051, "reward": 1.021875011920929, "reward_std": 0.1267334796488285, "rewards/accuracy_reward": 0.04583333376795053, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1052 }, { "clip_ratio": 0.0, "completion_length": 486.83335266113284, "epoch": 0.3370139222275564, "grad_norm": 0.1881573349237442, "kl": 0.36430116593837736, "learning_rate": 1.677065160750084e-05, "loss": 0.0722, "reward": 1.062500011920929, "reward_std": 0.10569052752107382, "rewards/accuracy_reward": 0.08333333432674409, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666805744171, "step": 1053 }, { "clip_ratio": 0.0, "completion_length": 504.9583465576172, "epoch": 0.3373339734357497, "grad_norm": 0.1714252233505249, "kl": 0.5596703916788102, "learning_rate": 1.6762422654652806e-05, "loss": 0.1102, "reward": 1.0255208611488342, "reward_std": 0.11673090867698192, "rewards/accuracy_reward": 0.050000001676380634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 1054 }, { "clip_ratio": 0.0, "completion_length": 519.2104339599609, "epoch": 0.337654024643943, "grad_norm": 0.13085860013961792, "kl": 0.6136700950562954, "learning_rate": 1.6754185255236047e-05, "loss": 0.0916, "reward": 1.0729166984558105, "reward_std": 0.1413856975734234, "rewards/accuracy_reward": 0.09791667107492685, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 514.6895965576172, "epoch": 0.33797407585213635, "grad_norm": 0.16582679748535156, "kl": 0.6665895022451878, "learning_rate": 1.674593941953945e-05, "loss": 0.0631, "reward": 1.053645873069763, "reward_std": 0.1587853878736496, "rewards/accuracy_reward": 0.07916666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791805744171, "step": 1056 }, { "clip_ratio": 0.0, "completion_length": 507.177099609375, "epoch": 0.33829412706032963, "grad_norm": 0.09304836392402649, "kl": 0.4092469088733196, "learning_rate": 1.6737685157862428e-05, "loss": 0.072, "reward": 1.1234375417232514, "reward_std": 0.1315523639321327, "rewards/accuracy_reward": 0.13958333767950534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541746139526, "step": 1057 }, { "clip_ratio": 0.0, "completion_length": 555.7229370117187, "epoch": 0.33861417826852297, "grad_norm": 0.35923483967781067, "kl": 0.9676612123847008, "learning_rate": 1.6729422480514926e-05, "loss": 0.074, "reward": 1.0307291865348815, "reward_std": 0.12929603606462478, "rewards/accuracy_reward": 0.05625000018626451, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9723958492279052, "step": 1058 }, { "clip_ratio": 0.0, "completion_length": 516.5208587646484, "epoch": 0.33893422947671625, "grad_norm": 0.20335394144058228, "kl": 0.747900664061308, "learning_rate": 1.67211513978174e-05, "loss": 0.1232, "reward": 1.0276041984558106, "reward_std": 0.15231779962778091, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541865348816, "step": 1059 }, { "clip_ratio": 0.0, "completion_length": 491.5520965576172, "epoch": 0.3392542806849096, "grad_norm": 0.23044894635677338, "kl": 0.5531878419220447, "learning_rate": 1.6712871920100796e-05, "loss": 0.1287, "reward": 1.0656250238418579, "reward_std": 0.11024533435702324, "rewards/accuracy_reward": 0.09166667014360427, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583373069763, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 497.895849609375, "epoch": 0.3395743318931029, "grad_norm": 0.13916930556297302, "kl": 0.6994629740715027, "learning_rate": 1.6704584057706558e-05, "loss": 0.1484, "reward": 1.0302083551883698, "reward_std": 0.14814634323120118, "rewards/accuracy_reward": 0.0666666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9635416805744171, "step": 1061 }, { "clip_ratio": 0.0, "completion_length": 525.333349609375, "epoch": 0.3398943831012962, "grad_norm": 0.06657827645540237, "kl": 0.26350629031658174, "learning_rate": 1.6696287820986595e-05, "loss": 0.0732, "reward": 0.9911458373069764, "reward_std": 0.06581217646598816, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9869791686534881, "step": 1062 }, { "clip_ratio": 0.0, "completion_length": 506.0541748046875, "epoch": 0.3402144343094895, "grad_norm": 0.14629337191581726, "kl": 0.2807926818728447, "learning_rate": 1.668798322030328e-05, "loss": 0.0584, "reward": 1.0437500119209289, "reward_std": 0.06695888042449952, "rewards/accuracy_reward": 0.06041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333432674408, "step": 1063 }, { "clip_ratio": 0.0, "completion_length": 512.6021057128906, "epoch": 0.34053448551768284, "grad_norm": 0.22216708958148956, "kl": 0.6122836649417878, "learning_rate": 1.667967026602943e-05, "loss": 0.0986, "reward": 1.0703125238418578, "reward_std": 0.1326361045241356, "rewards/accuracy_reward": 0.10000000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125178813934, "step": 1064 }, { "clip_ratio": 0.0, "completion_length": 509.1354248046875, "epoch": 0.3408545367258761, "grad_norm": 0.13464389741420746, "kl": 0.6803478240966797, "learning_rate": 1.66713489685483e-05, "loss": 0.1292, "reward": 1.0578125357627868, "reward_std": 0.1410712368786335, "rewards/accuracy_reward": 0.0916666692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458492279053, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 517.6291870117187, "epoch": 0.34117458793406946, "grad_norm": 0.12916676700115204, "kl": 0.5221568688750267, "learning_rate": 1.6663019338253556e-05, "loss": 0.104, "reward": 1.0270833432674409, "reward_std": 0.12490638475865126, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 1066 }, { "clip_ratio": 0.0, "completion_length": 511.689599609375, "epoch": 0.34149463914226275, "grad_norm": 0.21543385088443756, "kl": 0.8520060390233993, "learning_rate": 1.665468138554929e-05, "loss": 0.2002, "reward": 0.979166692495346, "reward_std": 0.16948885917663575, "rewards/accuracy_reward": 0.02291666679084301, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9541666805744171, "step": 1067 }, { "clip_ratio": 0.0, "completion_length": 526.2770935058594, "epoch": 0.3418146903504561, "grad_norm": 0.26105642318725586, "kl": 0.8584218144416809, "learning_rate": 1.6646335120849964e-05, "loss": 0.2162, "reward": 1.0072916924953461, "reward_std": 0.20712767243385316, "rewards/accuracy_reward": 0.05625000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9510416924953461, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 546.4416809082031, "epoch": 0.34213474155864937, "grad_norm": 0.24653339385986328, "kl": 1.068970836699009, "learning_rate": 1.6637980554580447e-05, "loss": 0.1043, "reward": 1.1468750298023225, "reward_std": 0.1717896606773138, "rewards/accuracy_reward": 0.18958334028720855, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916865348816, "step": 1069 }, { "clip_ratio": 0.0, "completion_length": 503.23126220703125, "epoch": 0.3424547927668427, "grad_norm": 0.1497165560722351, "kl": 1.0819413036108017, "learning_rate": 1.6629617697175967e-05, "loss": 0.1963, "reward": 0.993750023841858, "reward_std": 0.18702587112784386, "rewards/accuracy_reward": 0.03750000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9562500298023224, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 529.4021026611329, "epoch": 0.342774843975036, "grad_norm": 0.6320114731788635, "kl": 1.6799171954393386, "learning_rate": 1.66212465590821e-05, "loss": 0.2058, "reward": 1.105729204416275, "reward_std": 0.1997086688876152, "rewards/accuracy_reward": 0.16458333767950534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9411458492279052, "step": 1071 }, { "clip_ratio": 0.0, "completion_length": 532.039599609375, "epoch": 0.34309489518322933, "grad_norm": 0.23983895778656006, "kl": 0.9433726727962494, "learning_rate": 1.6612867150754776e-05, "loss": 0.091, "reward": 1.0604167044162751, "reward_std": 0.1479168005287647, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333492279053, "step": 1072 }, { "clip_ratio": 0.0, "completion_length": 530.1604370117187, "epoch": 0.3434149463914226, "grad_norm": 0.19629456102848053, "kl": 0.9360431842505932, "learning_rate": 1.6604479482660257e-05, "loss": 0.1161, "reward": 1.051562523841858, "reward_std": 0.1672593917697668, "rewards/accuracy_reward": 0.0958333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291805744171, "step": 1073 }, { "clip_ratio": 0.0, "completion_length": 525.5458465576172, "epoch": 0.34373499759961595, "grad_norm": 0.5441807508468628, "kl": 0.7919405251741409, "learning_rate": 1.6596083565275107e-05, "loss": 0.1435, "reward": 0.9848958551883698, "reward_std": 0.14814932085573673, "rewards/accuracy_reward": 0.029166667722165585, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291865348816, "step": 1074 }, { "clip_ratio": 0.0, "completion_length": 518.2146057128906, "epoch": 0.34405504880780924, "grad_norm": 0.20584291219711304, "kl": 0.8357704304158687, "learning_rate": 1.6587679409086207e-05, "loss": 0.1099, "reward": 1.0416666865348816, "reward_std": 0.11115128733217716, "rewards/accuracy_reward": 0.07083333432674407, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333492279053, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 521.4625183105469, "epoch": 0.3443751000160026, "grad_norm": 0.2858220636844635, "kl": 0.8899985015392303, "learning_rate": 1.6579267024590727e-05, "loss": 0.1511, "reward": 1.1177083790302276, "reward_std": 0.15269537195563315, "rewards/accuracy_reward": 0.15833333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9593750178813935, "step": 1076 }, { "clip_ratio": 0.0, "completion_length": 540.2854278564453, "epoch": 0.34469515122419586, "grad_norm": 0.20057371258735657, "kl": 0.8167131602764129, "learning_rate": 1.6570846422296102e-05, "loss": 0.1817, "reward": 1.0286458492279054, "reward_std": 0.17073360234498977, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9494791805744172, "step": 1077 }, { "clip_ratio": 0.0, "completion_length": 498.61876525878904, "epoch": 0.3450152024323892, "grad_norm": 0.12494718283414841, "kl": 0.4915993630886078, "learning_rate": 1.6562417612720055e-05, "loss": 0.0835, "reward": 1.0744792103767395, "reward_std": 0.14771257862448692, "rewards/accuracy_reward": 0.09375000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291865348816, "step": 1078 }, { "clip_ratio": 0.0, "completion_length": 519.5000152587891, "epoch": 0.3453352536405825, "grad_norm": 0.17784957587718964, "kl": 0.9045666679739952, "learning_rate": 1.6553980606390538e-05, "loss": 0.1211, "reward": 1.0593750119209289, "reward_std": 0.13307706415653228, "rewards/accuracy_reward": 0.08958333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916805744171, "step": 1079 }, { "clip_ratio": 0.0, "completion_length": 489.95001220703125, "epoch": 0.3456553048487758, "grad_norm": 0.09965129941701889, "kl": 0.5666639655828476, "learning_rate": 1.654553541384575e-05, "loss": 0.0682, "reward": 0.9807291805744172, "reward_std": 0.09674109499901533, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625119209289, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 517.4583526611328, "epoch": 0.3459753560569691, "grad_norm": 0.3702761232852936, "kl": 0.6852307498455048, "learning_rate": 1.6537082045634116e-05, "loss": 0.1417, "reward": 1.0458333551883698, "reward_std": 0.09951124414801597, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166746139526, "step": 1081 }, { "clip_ratio": 0.0, "completion_length": 466.1000122070312, "epoch": 0.34629540726516245, "grad_norm": 0.29468998312950134, "kl": 0.8192525319755077, "learning_rate": 1.6528620512314276e-05, "loss": 0.1118, "reward": 1.017187523841858, "reward_std": 0.09673091005533933, "rewards/accuracy_reward": 0.039583335444331166, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041746139527, "step": 1082 }, { "clip_ratio": 0.0, "completion_length": 528.035433959961, "epoch": 0.34661545847335573, "grad_norm": 0.11237310618162155, "kl": 0.3425339564681053, "learning_rate": 1.652015082445506e-05, "loss": 0.0822, "reward": 1.0802083551883697, "reward_std": 0.0760267723351717, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083492279052, "step": 1083 }, { "clip_ratio": 0.0, "completion_length": 493.88958740234375, "epoch": 0.34693550968154907, "grad_norm": 0.1413935422897339, "kl": 0.43408130556344987, "learning_rate": 1.6511672992635478e-05, "loss": 0.1053, "reward": 1.0927083551883698, "reward_std": 0.13995889909565448, "rewards/accuracy_reward": 0.11875000353902579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 1084 }, { "clip_ratio": 0.0, "completion_length": 496.1083526611328, "epoch": 0.34725556088974235, "grad_norm": 0.09691617637872696, "kl": 0.2840468570590019, "learning_rate": 1.6503187027444737e-05, "loss": 0.0298, "reward": 1.1270833611488342, "reward_std": 0.04991227090358734, "rewards/accuracy_reward": 0.13750000409781932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9895833373069763, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 518.3021026611328, "epoch": 0.3475756120979357, "grad_norm": 0.04689677432179451, "kl": 0.18280332162976265, "learning_rate": 1.6494692939482183e-05, "loss": 0.0029, "reward": 1.103125023841858, "reward_std": 0.07317595779895783, "rewards/accuracy_reward": 0.10833333935588599, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9947916746139527, "step": 1086 }, { "clip_ratio": 0.0, "completion_length": 487.4875183105469, "epoch": 0.347895663306129, "grad_norm": 0.0464547798037529, "kl": 0.22884158343076705, "learning_rate": 1.6486190739357307e-05, "loss": 0.007, "reward": 1.074479192495346, "reward_std": 0.06167098730802536, "rewards/accuracy_reward": 0.08333333432674409, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9911458551883697, "step": 1087 }, { "clip_ratio": 0.0, "completion_length": 482.2875122070312, "epoch": 0.3482157145143223, "grad_norm": 0.17549873888492584, "kl": 0.2067810483276844, "learning_rate": 1.6477680437689746e-05, "loss": 0.0603, "reward": 1.0901041865348815, "reward_std": 0.11889987215399742, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9901041746139526, "step": 1088 }, { "clip_ratio": 0.0, "completion_length": 525.6250183105469, "epoch": 0.3485357657225156, "grad_norm": 0.2971534729003906, "kl": 0.2798635631799698, "learning_rate": 1.646916204510924e-05, "loss": 0.0891, "reward": 1.085416704416275, "reward_std": 0.10783621110022068, "rewards/accuracy_reward": 0.10833333842456341, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833432674408, "step": 1089 }, { "clip_ratio": 0.0, "completion_length": 501.827099609375, "epoch": 0.34885581693070894, "grad_norm": 0.22730644047260284, "kl": 0.2414296567440033, "learning_rate": 1.6460635572255644e-05, "loss": 0.0829, "reward": 1.0468750298023224, "reward_std": 0.08340628929436207, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916805744171, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 486.25209350585936, "epoch": 0.3491758681389022, "grad_norm": 0.10404152423143387, "kl": 0.45262009650468826, "learning_rate": 1.6452101029778908e-05, "loss": 0.0416, "reward": 1.005729180574417, "reward_std": 0.06869241334497929, "rewards/accuracy_reward": 0.020833333395421504, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958492279053, "step": 1091 }, { "clip_ratio": 0.0, "completion_length": 500.92710266113284, "epoch": 0.34949591934709556, "grad_norm": 0.09202590584754944, "kl": 0.250038680434227, "learning_rate": 1.6443558428339054e-05, "loss": 0.0159, "reward": 1.090625035762787, "reward_std": 0.10430398043245077, "rewards/accuracy_reward": 0.0979166703298688, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9927083432674408, "step": 1092 }, { "clip_ratio": 0.0, "completion_length": 499.0458465576172, "epoch": 0.34981597055528885, "grad_norm": 0.10142835229635239, "kl": 0.2372105412185192, "learning_rate": 1.6435007778606177e-05, "loss": 0.0508, "reward": 1.0635416984558106, "reward_std": 0.08527100309729577, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990625011920929, "step": 1093 }, { "clip_ratio": 0.0, "completion_length": 543.3916839599609, "epoch": 0.35013602176348213, "grad_norm": 0.08181018382310867, "kl": 0.3042118564248085, "learning_rate": 1.6426449091260424e-05, "loss": 0.0624, "reward": 1.0932291924953461, "reward_std": 0.10010075122117996, "rewards/accuracy_reward": 0.10833333749324084, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958551883698, "step": 1094 }, { "clip_ratio": 0.0, "completion_length": 509.5520965576172, "epoch": 0.35045607297167547, "grad_norm": 0.1259605586528778, "kl": 0.19862473011016846, "learning_rate": 1.641788237699197e-05, "loss": 0.025, "reward": 1.0520833611488343, "reward_std": 0.08347481749951839, "rewards/accuracy_reward": 0.06041666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916666746139526, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 503.3416809082031, "epoch": 0.35077612417986875, "grad_norm": 0.6125533580780029, "kl": 0.23866599127650262, "learning_rate": 1.6409307646501032e-05, "loss": 0.0605, "reward": 1.0468750178813935, "reward_std": 0.08658724837005138, "rewards/accuracy_reward": 0.06250000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1096 }, { "clip_ratio": 0.0, "completion_length": 529.6604309082031, "epoch": 0.3510961753880621, "grad_norm": 0.13647539913654327, "kl": 0.5289441749453545, "learning_rate": 1.6400724910497832e-05, "loss": 0.0664, "reward": 1.0973958551883698, "reward_std": 0.15426294282078742, "rewards/accuracy_reward": 0.11666667126119137, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291746139526, "step": 1097 }, { "clip_ratio": 0.0, "completion_length": 543.6437683105469, "epoch": 0.3514162265962554, "grad_norm": 0.14252297580242157, "kl": 0.3495174624025822, "learning_rate": 1.6392134179702585e-05, "loss": 0.0596, "reward": 1.0697916865348815, "reward_std": 0.07795717976987362, "rewards/accuracy_reward": 0.08541666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1098 }, { "clip_ratio": 0.0, "completion_length": 516.4708465576172, "epoch": 0.3517362778044487, "grad_norm": 0.12695029377937317, "kl": 0.3373839229345322, "learning_rate": 1.6383535464845507e-05, "loss": 0.0763, "reward": 1.043229192495346, "reward_std": 0.09906813129782677, "rewards/accuracy_reward": 0.0604166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9828125178813935, "step": 1099 }, { "clip_ratio": 0.0, "completion_length": 552.395849609375, "epoch": 0.352056329012642, "grad_norm": 0.22602546215057373, "kl": 0.6235632814466954, "learning_rate": 1.637492877666677e-05, "loss": 0.111, "reward": 1.1000000476837157, "reward_std": 0.13195635173469783, "rewards/accuracy_reward": 0.13541666995733975, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833611488343, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 563.6875244140625, "epoch": 0.35237638022083534, "grad_norm": 0.09847158193588257, "kl": 0.35407338961958884, "learning_rate": 1.6366314125916524e-05, "loss": 0.0664, "reward": 1.1432292103767394, "reward_std": 0.09192095547914506, "rewards/accuracy_reward": 0.16458333730697633, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458611488342, "step": 1101 }, { "clip_ratio": 0.0, "completion_length": 543.289599609375, "epoch": 0.3526964314290286, "grad_norm": 0.1806415617465973, "kl": 0.5341693744063377, "learning_rate": 1.635769152335484e-05, "loss": 0.113, "reward": 1.051041692495346, "reward_std": 0.12980886101722716, "rewards/accuracy_reward": 0.08125000353902578, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916805744171, "step": 1102 }, { "clip_ratio": 0.0, "completion_length": 520.8729400634766, "epoch": 0.35301648263722196, "grad_norm": 0.6527080535888672, "kl": 0.81370819658041, "learning_rate": 1.6349060979751744e-05, "loss": 0.144, "reward": 1.083854180574417, "reward_std": 0.14183319211006165, "rewards/accuracy_reward": 0.12083333693444728, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208432674408, "step": 1103 }, { "clip_ratio": 0.0, "completion_length": 546.8625183105469, "epoch": 0.35333653384541525, "grad_norm": 0.3019215166568756, "kl": 0.7167576387524605, "learning_rate": 1.634042250588717e-05, "loss": 0.1301, "reward": 1.0328125178813934, "reward_std": 0.15510014891624452, "rewards/accuracy_reward": 0.07500000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9578125059604645, "step": 1104 }, { "clip_ratio": 0.0, "completion_length": 542.595849609375, "epoch": 0.3536565850536086, "grad_norm": 0.1385013312101364, "kl": 0.6982532098889351, "learning_rate": 1.6331776112550956e-05, "loss": 0.1219, "reward": 1.1036458492279053, "reward_std": 0.21694720312952995, "rewards/accuracy_reward": 0.14583333469927312, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9578125178813934, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 548.2854248046875, "epoch": 0.35397663626180187, "grad_norm": 0.40079644322395325, "kl": 0.6103353053331375, "learning_rate": 1.6323121810542836e-05, "loss": 0.1437, "reward": 0.965104204416275, "reward_std": 0.1538691446185112, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9546875298023224, "step": 1106 }, { "clip_ratio": 0.0, "completion_length": 564.8979309082031, "epoch": 0.3542966874699952, "grad_norm": 0.2274789661169052, "kl": 0.5727997168898582, "learning_rate": 1.631445961067242e-05, "loss": 0.0982, "reward": 1.0286458611488343, "reward_std": 0.17690156698226928, "rewards/accuracy_reward": 0.07083333432674407, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9578125059604645, "step": 1107 }, { "clip_ratio": 0.0, "completion_length": 565.4812683105469, "epoch": 0.3546167386781885, "grad_norm": 0.45921313762664795, "kl": 0.6686241254210472, "learning_rate": 1.6305789523759186e-05, "loss": 0.0862, "reward": 1.0213542044162751, "reward_std": 0.1808813363313675, "rewards/accuracy_reward": 0.06458333600312471, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9567708432674408, "step": 1108 }, { "clip_ratio": 0.0, "completion_length": 554.1604431152343, "epoch": 0.35493678988638183, "grad_norm": 0.1756734997034073, "kl": 1.0812157839536667, "learning_rate": 1.6297111560632456e-05, "loss": 0.1661, "reward": 0.9817708611488343, "reward_std": 0.21475159972906113, "rewards/accuracy_reward": 0.0479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9338541865348816, "step": 1109 }, { "clip_ratio": 0.0, "completion_length": 544.1104339599609, "epoch": 0.3552568410945751, "grad_norm": 0.2468792200088501, "kl": 1.1981078289449214, "learning_rate": 1.62884257321314e-05, "loss": 0.1969, "reward": 1.0427083671092987, "reward_std": 0.18149395957589148, "rewards/accuracy_reward": 0.10833333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9343750238418579, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 541.9416717529297, "epoch": 0.35557689230276845, "grad_norm": 0.20042645931243896, "kl": 0.9907065749168396, "learning_rate": 1.6279732049105e-05, "loss": 0.1766, "reward": 1.0307291924953461, "reward_std": 0.2287411093711853, "rewards/accuracy_reward": 0.10833333842456341, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9223958492279053, "step": 1111 }, { "clip_ratio": 0.0, "completion_length": 569.5062683105468, "epoch": 0.35589694351096174, "grad_norm": 0.43150994181632996, "kl": 1.5727191627025605, "learning_rate": 1.6271030522412066e-05, "loss": 0.2304, "reward": 0.9906250178813935, "reward_std": 0.22456872165203096, "rewards/accuracy_reward": 0.06458333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9260416865348816, "step": 1112 }, { "clip_ratio": 0.0, "completion_length": 569.5145965576172, "epoch": 0.3562169947191551, "grad_norm": 0.20752328634262085, "kl": 1.127117747068405, "learning_rate": 1.6262321162921186e-05, "loss": 0.2099, "reward": 1.049479180574417, "reward_std": 0.18920135349035264, "rewards/accuracy_reward": 0.10833333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9411458432674408, "step": 1113 }, { "clip_ratio": 0.0, "completion_length": 554.3437683105469, "epoch": 0.35653704592734836, "grad_norm": 0.20367443561553955, "kl": 1.2074265986680985, "learning_rate": 1.6253603981510742e-05, "loss": 0.1763, "reward": 1.0625000298023224, "reward_std": 0.20715280324220658, "rewards/accuracy_reward": 0.12500000335276126, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9375000178813935, "step": 1114 }, { "clip_ratio": 0.0, "completion_length": 528.4020935058594, "epoch": 0.3568570971355417, "grad_norm": 0.2791835367679596, "kl": 0.6976873815059662, "learning_rate": 1.6244878989068884e-05, "loss": 0.1238, "reward": 1.1109375298023223, "reward_std": 0.16525277644395828, "rewards/accuracy_reward": 0.15416667088866234, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9567708492279052, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 558.8916839599609, "epoch": 0.357177148343735, "grad_norm": 0.22703680396080017, "kl": 0.7577734768390656, "learning_rate": 1.623614619649352e-05, "loss": 0.1371, "reward": 1.0723958730697631, "reward_std": 0.14903780817985535, "rewards/accuracy_reward": 0.11458333674818277, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9578125178813934, "step": 1116 }, { "clip_ratio": 0.0, "completion_length": 538.4645965576171, "epoch": 0.3574971995519283, "grad_norm": 0.22214283049106598, "kl": 0.6411052107810974, "learning_rate": 1.6227405614692295e-05, "loss": 0.1377, "reward": 1.0541666924953461, "reward_std": 0.17779069989919663, "rewards/accuracy_reward": 0.10000000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9541666805744171, "step": 1117 }, { "clip_ratio": 0.0, "completion_length": 535.2000183105469, "epoch": 0.3578172507601216, "grad_norm": 0.25091204047203064, "kl": 1.055853134393692, "learning_rate": 1.621865725458259e-05, "loss": 0.2165, "reward": 1.1354166984558105, "reward_std": 0.19953776970505716, "rewards/accuracy_reward": 0.19166667368263007, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9437500178813935, "step": 1118 }, { "clip_ratio": 0.0, "completion_length": 532.5750183105469, "epoch": 0.35813730196831495, "grad_norm": 0.31793567538261414, "kl": 0.9337344884872436, "learning_rate": 1.6209901127091495e-05, "loss": 0.1811, "reward": 1.0005208492279052, "reward_std": 0.14889881759881973, "rewards/accuracy_reward": 0.045833334885537626, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9546875178813934, "step": 1119 }, { "clip_ratio": 0.0, "completion_length": 543.7541809082031, "epoch": 0.35845735317650823, "grad_norm": 0.4377756416797638, "kl": 1.5817268535494804, "learning_rate": 1.6201137243155815e-05, "loss": 0.1998, "reward": 1.032812523841858, "reward_std": 0.18937183991074563, "rewards/accuracy_reward": 0.09583333488553762, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9369791865348815, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 512.9750122070312, "epoch": 0.35877740438470157, "grad_norm": 0.24275319278240204, "kl": 0.6483488872647285, "learning_rate": 1.619236561372202e-05, "loss": 0.1876, "reward": 1.1187500298023223, "reward_std": 0.1460374455899, "rewards/accuracy_reward": 0.1604166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333551883697, "step": 1121 }, { "clip_ratio": 0.0, "completion_length": 548.0521057128906, "epoch": 0.35909745559289485, "grad_norm": 0.3204553723335266, "kl": 0.9119420304894448, "learning_rate": 1.618358624974628e-05, "loss": 0.1642, "reward": 1.0125000119209289, "reward_std": 0.18264763969928027, "rewards/accuracy_reward": 0.07083333488553763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9416666865348816, "step": 1122 }, { "clip_ratio": 0.0, "completion_length": 528.004183959961, "epoch": 0.3594175068010882, "grad_norm": 0.39938727021217346, "kl": 0.5670314341783523, "learning_rate": 1.617479916219441e-05, "loss": 0.124, "reward": 1.0885416865348816, "reward_std": 0.20538848787546157, "rewards/accuracy_reward": 0.12083333786576986, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083432674408, "step": 1123 }, { "clip_ratio": 0.0, "completion_length": 539.3812744140625, "epoch": 0.3597375580092815, "grad_norm": 0.11475303769111633, "kl": 0.294252347946167, "learning_rate": 1.6166004362041867e-05, "loss": 0.0812, "reward": 1.0109375298023224, "reward_std": 0.12263874225318432, "rewards/accuracy_reward": 0.03333333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 1124 }, { "clip_ratio": 0.0, "completion_length": 510.6312622070312, "epoch": 0.3600576092174748, "grad_norm": 0.27126359939575195, "kl": 0.5437161371111869, "learning_rate": 1.6157201860273764e-05, "loss": 0.1592, "reward": 1.0229166924953461, "reward_std": 0.15174967646598816, "rewards/accuracy_reward": 0.06041666697710753, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9625000178813934, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 518.8375122070313, "epoch": 0.3603776604256681, "grad_norm": 0.27678969502449036, "kl": 0.3689132109284401, "learning_rate": 1.614839166788481e-05, "loss": 0.1059, "reward": 1.0052083432674408, "reward_std": 0.1078458171337843, "rewards/accuracy_reward": 0.02708333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250119209289, "step": 1126 }, { "clip_ratio": 0.0, "completion_length": 515.1708343505859, "epoch": 0.36069771163386144, "grad_norm": 0.28149381279945374, "kl": 0.5752302646636963, "learning_rate": 1.6139573795879337e-05, "loss": 0.1607, "reward": 0.9807291805744172, "reward_std": 0.13525601997971534, "rewards/accuracy_reward": 0.01458333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458432674408, "step": 1127 }, { "clip_ratio": 0.0, "completion_length": 504.80418090820314, "epoch": 0.3610177628420547, "grad_norm": 0.18258237838745117, "kl": 0.4668318539857864, "learning_rate": 1.6130748255271257e-05, "loss": 0.1452, "reward": 1.0510416805744172, "reward_std": 0.12038996331393718, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965625011920929, "step": 1128 }, { "clip_ratio": 0.0, "completion_length": 499.2229278564453, "epoch": 0.36133781405024806, "grad_norm": 0.499051570892334, "kl": 0.7709709912538528, "learning_rate": 1.6121915057084064e-05, "loss": 0.1169, "reward": 1.0291666984558105, "reward_std": 0.15609467439353467, "rewards/accuracy_reward": 0.05416666753590107, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 1129 }, { "clip_ratio": 0.0, "completion_length": 517.5145904541016, "epoch": 0.36165786525844135, "grad_norm": 0.2296091914176941, "kl": 0.5238408371806145, "learning_rate": 1.6113074212350827e-05, "loss": 0.155, "reward": 1.0187500238418579, "reward_std": 0.13111219555139542, "rewards/accuracy_reward": 0.05416666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833432674408, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 473.2791778564453, "epoch": 0.3619779164666347, "grad_norm": 0.20802690088748932, "kl": 0.6877270132303238, "learning_rate": 1.6104225732114143e-05, "loss": 0.1275, "reward": 1.0468750178813935, "reward_std": 0.11990614160895348, "rewards/accuracy_reward": 0.07083333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416686534882, "step": 1131 }, { "clip_ratio": 0.0, "completion_length": 500.7354309082031, "epoch": 0.36229796767482797, "grad_norm": 0.3290000259876251, "kl": 0.7059184789657593, "learning_rate": 1.609536962742617e-05, "loss": 0.1368, "reward": 1.0656250357627868, "reward_std": 0.1308181770145893, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9656250178813934, "step": 1132 }, { "clip_ratio": 0.0, "completion_length": 462.877099609375, "epoch": 0.3626180188830213, "grad_norm": 0.21400968730449677, "kl": 0.5877170011401176, "learning_rate": 1.6086505909348585e-05, "loss": 0.1876, "reward": 1.1864583671092988, "reward_std": 0.22381700724363326, "rewards/accuracy_reward": 0.22916667275130748, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916805744172, "step": 1133 }, { "clip_ratio": 0.0, "completion_length": 477.80417785644534, "epoch": 0.3629380700912146, "grad_norm": 0.11039919406175613, "kl": 0.2935449294745922, "learning_rate": 1.6077634588952552e-05, "loss": 0.0748, "reward": 1.1015625476837159, "reward_std": 0.1574238944798708, "rewards/accuracy_reward": 0.11666666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958432674408, "step": 1134 }, { "clip_ratio": 0.0, "completion_length": 489.3020965576172, "epoch": 0.36325812129940793, "grad_norm": 0.2583947479724884, "kl": 0.534557220339775, "learning_rate": 1.606875567731876e-05, "loss": 0.0936, "reward": 1.0109375238418579, "reward_std": 0.11386194564402104, "rewards/accuracy_reward": 0.0312500013038516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9796875178813934, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 470.9104278564453, "epoch": 0.3635781725076012, "grad_norm": 0.2370891273021698, "kl": 0.4823115229606628, "learning_rate": 1.6059869185537363e-05, "loss": 0.0808, "reward": 1.0354166865348815, "reward_std": 0.09459959566593171, "rewards/accuracy_reward": 0.05000000055879354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166686534882, "step": 1136 }, { "clip_ratio": 0.0, "completion_length": 480.7895904541016, "epoch": 0.3638982237157945, "grad_norm": 0.3485714793205261, "kl": 0.479145385324955, "learning_rate": 1.605097512470799e-05, "loss": 0.1299, "reward": 1.0880208551883697, "reward_std": 0.150136499106884, "rewards/accuracy_reward": 0.12083333674818278, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875059604644, "step": 1137 }, { "clip_ratio": 0.0, "completion_length": 516.0750122070312, "epoch": 0.36421827492398784, "grad_norm": 0.35083115100860596, "kl": 0.46517665684223175, "learning_rate": 1.6042073505939718e-05, "loss": 0.1314, "reward": 1.0151041924953461, "reward_std": 0.1264990646392107, "rewards/accuracy_reward": 0.04375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541805744171, "step": 1138 }, { "clip_ratio": 0.0, "completion_length": 510.9229400634766, "epoch": 0.3645383261321811, "grad_norm": 0.18946900963783264, "kl": 0.6526028856635093, "learning_rate": 1.6033164340351065e-05, "loss": 0.0952, "reward": 1.096875011920929, "reward_std": 0.10533001609146594, "rewards/accuracy_reward": 0.12291666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583432674408, "step": 1139 }, { "clip_ratio": 0.0, "completion_length": 552.2708526611328, "epoch": 0.36485837734037446, "grad_norm": 0.12671904265880585, "kl": 0.6638071507215499, "learning_rate": 1.6024247639069987e-05, "loss": 0.1385, "reward": 1.0093750178813934, "reward_std": 0.15015630498528482, "rewards/accuracy_reward": 0.04166666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083492279053, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 505.11668090820314, "epoch": 0.36517842854856775, "grad_norm": 0.4702635407447815, "kl": 1.0072061479091645, "learning_rate": 1.6015323413233838e-05, "loss": 0.1401, "reward": 1.076562523841858, "reward_std": 0.1350021906197071, "rewards/accuracy_reward": 0.10833333842456341, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291805744171, "step": 1141 }, { "clip_ratio": 0.0, "completion_length": 511.72918090820315, "epoch": 0.3654984797567611, "grad_norm": 0.2643105089664459, "kl": 0.6117392227053642, "learning_rate": 1.6006391673989373e-05, "loss": 0.1544, "reward": 1.0864583671092987, "reward_std": 0.16201163977384567, "rewards/accuracy_reward": 0.11041667200624943, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1142 }, { "clip_ratio": 0.0, "completion_length": 500.4666778564453, "epoch": 0.36581853096495437, "grad_norm": 0.2908649742603302, "kl": 0.517003245651722, "learning_rate": 1.5997452432492732e-05, "loss": 0.0798, "reward": 1.020312523841858, "reward_std": 0.12842915281653405, "rewards/accuracy_reward": 0.037500002048909666, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982812511920929, "step": 1143 }, { "clip_ratio": 0.0, "completion_length": 495.1750152587891, "epoch": 0.3661385821731477, "grad_norm": 0.1781575083732605, "kl": 0.5948102369904518, "learning_rate": 1.598850569990944e-05, "loss": 0.1188, "reward": 1.0609375298023225, "reward_std": 0.10798088498413563, "rewards/accuracy_reward": 0.08125, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9796875238418579, "step": 1144 }, { "clip_ratio": 0.0, "completion_length": 513.6854370117187, "epoch": 0.366458633381341, "grad_norm": 0.2938849627971649, "kl": 0.38231213241815565, "learning_rate": 1.5979551487414357e-05, "loss": 0.0857, "reward": 1.0234375238418578, "reward_std": 0.08541666828095913, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541805744171, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 493.327099609375, "epoch": 0.36677868458953433, "grad_norm": 0.12273690849542618, "kl": 0.26505750194191935, "learning_rate": 1.5970589806191698e-05, "loss": 0.0526, "reward": 1.0729166746139527, "reward_std": 0.09640407245606183, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916666686534882, "step": 1146 }, { "clip_ratio": 0.0, "completion_length": 511.9125183105469, "epoch": 0.3670987357977276, "grad_norm": 0.19048544764518738, "kl": 0.5402480706572532, "learning_rate": 1.5961620667434997e-05, "loss": 0.0985, "reward": 1.0666666865348815, "reward_std": 0.08264825325459242, "rewards/accuracy_reward": 0.08333333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333432674408, "step": 1147 }, { "clip_ratio": 0.0, "completion_length": 518.0104309082031, "epoch": 0.36741878700592095, "grad_norm": 0.320730596780777, "kl": 0.39287843108177184, "learning_rate": 1.5952644082347124e-05, "loss": 0.0834, "reward": 1.064062523841858, "reward_std": 0.14702175408601761, "rewards/accuracy_reward": 0.08750000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625119209289, "step": 1148 }, { "clip_ratio": 0.0, "completion_length": 493.71876831054686, "epoch": 0.36773883821411424, "grad_norm": 0.11714158952236176, "kl": 0.2555687852203846, "learning_rate": 1.5943660062140226e-05, "loss": 0.0438, "reward": 1.1031250298023223, "reward_std": 0.11189155112951994, "rewards/accuracy_reward": 0.1208333371207118, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916865348816, "step": 1149 }, { "clip_ratio": 0.0, "completion_length": 509.1354248046875, "epoch": 0.3680588894223076, "grad_norm": 0.22005389630794525, "kl": 0.2881615623831749, "learning_rate": 1.593466861803575e-05, "loss": 0.1066, "reward": 1.0901041865348815, "reward_std": 0.11322282254695892, "rewards/accuracy_reward": 0.11041666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9796875059604645, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 491.908349609375, "epoch": 0.36837894063050086, "grad_norm": 0.5133959650993347, "kl": 0.6626154512166977, "learning_rate": 1.592566976126441e-05, "loss": 0.1367, "reward": 1.1182291805744171, "reward_std": 0.13968872725963594, "rewards/accuracy_reward": 0.14375000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791805744171, "step": 1151 }, { "clip_ratio": 0.0, "completion_length": 523.1250122070312, "epoch": 0.3686989918386942, "grad_norm": 0.17595118284225464, "kl": 0.43243874460458753, "learning_rate": 1.5916663503066184e-05, "loss": 0.087, "reward": 1.0171875178813934, "reward_std": 0.05748256333172321, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541805744171, "step": 1152 }, { "clip_ratio": 0.0, "completion_length": 534.4770965576172, "epoch": 0.3690190430468875, "grad_norm": 0.07913016527891159, "kl": 0.2869682595133781, "learning_rate": 1.5907649854690292e-05, "loss": 0.0644, "reward": 1.0677083611488343, "reward_std": 0.08603444769978523, "rewards/accuracy_reward": 0.08333333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750178813935, "step": 1153 }, { "clip_ratio": 0.0, "completion_length": 549.520849609375, "epoch": 0.3693390942550808, "grad_norm": 0.2064250111579895, "kl": 0.4336912453174591, "learning_rate": 1.5898628827395177e-05, "loss": 0.1009, "reward": 1.0192708551883698, "reward_std": 0.09977535083889962, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208492279053, "step": 1154 }, { "clip_ratio": 0.0, "completion_length": 565.8770965576172, "epoch": 0.3696591454632741, "grad_norm": 0.19949369132518768, "kl": 0.3324578292667866, "learning_rate": 1.5889600432448515e-05, "loss": 0.0774, "reward": 1.1135416746139526, "reward_std": 0.08293756693601609, "rewards/accuracy_reward": 0.12500000204890965, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9885416686534881, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 536.8979339599609, "epoch": 0.36997919667146745, "grad_norm": 0.6426315307617188, "kl": 0.7445389926433563, "learning_rate": 1.5880564681127172e-05, "loss": 0.1254, "reward": 1.0541666924953461, "reward_std": 0.12224040143191814, "rewards/accuracy_reward": 0.08333333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333492279053, "step": 1156 }, { "clip_ratio": 0.0, "completion_length": 536.3125122070312, "epoch": 0.37029924787966073, "grad_norm": 0.2057666778564453, "kl": 0.45236001163721085, "learning_rate": 1.5871521584717207e-05, "loss": 0.1005, "reward": 1.103125023841858, "reward_std": 0.10306334141641856, "rewards/accuracy_reward": 0.12500000409781933, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250119209289, "step": 1157 }, { "clip_ratio": 0.0, "completion_length": 532.7187683105469, "epoch": 0.37061929908785407, "grad_norm": 0.3101314902305603, "kl": 0.7116847023367882, "learning_rate": 1.5862471154513853e-05, "loss": 0.132, "reward": 1.0463541865348815, "reward_std": 0.12737105637788773, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708492279053, "step": 1158 }, { "clip_ratio": 0.0, "completion_length": 534.9916809082031, "epoch": 0.37093935029604735, "grad_norm": 0.30432990193367004, "kl": 0.6676271669566631, "learning_rate": 1.58534134018215e-05, "loss": 0.102, "reward": 0.9994791865348815, "reward_std": 0.10984005965292454, "rewards/accuracy_reward": 0.02916666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125119209289, "step": 1159 }, { "clip_ratio": 0.0, "completion_length": 543.6187713623046, "epoch": 0.3712594015042407, "grad_norm": 0.29318493604660034, "kl": 0.40648306608200074, "learning_rate": 1.5844348337953682e-05, "loss": 0.1287, "reward": 0.9895833551883697, "reward_std": 0.15547448098659516, "rewards/accuracy_reward": 0.022916667349636554, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666746139526, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 543.220849609375, "epoch": 0.371579452712434, "grad_norm": 0.1845497488975525, "kl": 0.5116272836923599, "learning_rate": 1.5835275974233083e-05, "loss": 0.0779, "reward": 1.0260416924953462, "reward_std": 0.13704813569784163, "rewards/accuracy_reward": 0.05416666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750238418579, "step": 1161 }, { "clip_ratio": 0.0, "completion_length": 557.5021057128906, "epoch": 0.3718995039206273, "grad_norm": 0.34543439745903015, "kl": 0.5618045464158058, "learning_rate": 1.5826196321991484e-05, "loss": 0.1316, "reward": 1.0187500298023224, "reward_std": 0.14411462992429733, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666805744171, "step": 1162 }, { "clip_ratio": 0.0, "completion_length": 560.439599609375, "epoch": 0.3722195551288206, "grad_norm": 0.27085787057876587, "kl": 0.4931087389588356, "learning_rate": 1.581710939256978e-05, "loss": 0.0706, "reward": 1.1380208611488343, "reward_std": 0.16875488683581352, "rewards/accuracy_reward": 0.16250000484287738, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208492279053, "step": 1163 }, { "clip_ratio": 0.0, "completion_length": 541.0229370117188, "epoch": 0.37253960633701394, "grad_norm": 0.24295267462730408, "kl": 0.6016337320208549, "learning_rate": 1.5808015197317944e-05, "loss": 0.073, "reward": 1.040625023841858, "reward_std": 0.12459696829319, "rewards/accuracy_reward": 0.058333334513008595, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916746139526, "step": 1164 }, { "clip_ratio": 0.0, "completion_length": 528.7375091552734, "epoch": 0.3728596575452072, "grad_norm": 0.46193087100982666, "kl": 0.49578318893909457, "learning_rate": 1.5798913747595038e-05, "loss": 0.1141, "reward": 1.030208373069763, "reward_std": 0.13383289575576782, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 535.8583465576172, "epoch": 0.37317970875340056, "grad_norm": 0.24223735928535461, "kl": 0.7255073443055153, "learning_rate": 1.5789805054769187e-05, "loss": 0.1319, "reward": 1.067187535762787, "reward_std": 0.17425041720271112, "rewards/accuracy_reward": 0.10833333600312471, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9588541924953461, "step": 1166 }, { "clip_ratio": 0.0, "completion_length": 521.708349609375, "epoch": 0.37349975996159385, "grad_norm": 0.2238290160894394, "kl": 0.6970250770449639, "learning_rate": 1.578068913021755e-05, "loss": 0.1519, "reward": 1.0447916984558105, "reward_std": 0.13058854918926954, "rewards/accuracy_reward": 0.07083333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583432674408, "step": 1167 }, { "clip_ratio": 0.0, "completion_length": 564.5500183105469, "epoch": 0.3738198111697872, "grad_norm": 0.20912905037403107, "kl": 0.735250449180603, "learning_rate": 1.5771565985326323e-05, "loss": 0.1231, "reward": 1.0281250298023223, "reward_std": 0.1537714421749115, "rewards/accuracy_reward": 0.06666666846722365, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583432674408, "step": 1168 }, { "clip_ratio": 0.0, "completion_length": 537.535433959961, "epoch": 0.37413986237798047, "grad_norm": 0.40195563435554504, "kl": 0.7344786658883095, "learning_rate": 1.5762435631490732e-05, "loss": 0.1266, "reward": 1.0489583671092988, "reward_std": 0.14664312303066254, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916865348816, "step": 1169 }, { "clip_ratio": 0.0, "completion_length": 534.6916809082031, "epoch": 0.3744599135861738, "grad_norm": 0.37209442257881165, "kl": 0.925914877653122, "learning_rate": 1.5753298080114983e-05, "loss": 0.1411, "reward": 0.9750000238418579, "reward_std": 0.14015717357397078, "rewards/accuracy_reward": 0.01250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9625000178813934, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 552.7041931152344, "epoch": 0.3747799647943671, "grad_norm": 0.3376603424549103, "kl": 0.7903919830918312, "learning_rate": 1.57441533426123e-05, "loss": 0.1121, "reward": 1.0151041865348815, "reward_std": 0.12867529951035978, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.967187511920929, "step": 1171 }, { "clip_ratio": 0.0, "completion_length": 545.6396057128907, "epoch": 0.37510001600256043, "grad_norm": 0.15526770055294037, "kl": 0.54666518419981, "learning_rate": 1.5735001430404864e-05, "loss": 0.0911, "reward": 1.0822916805744172, "reward_std": 0.13302346915006638, "rewards/accuracy_reward": 0.10833333488553762, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583432674408, "step": 1172 }, { "clip_ratio": 0.0, "completion_length": 561.8000183105469, "epoch": 0.3754200672107537, "grad_norm": 0.22396454215049744, "kl": 0.6107922196388245, "learning_rate": 1.5725842354923823e-05, "loss": 0.1063, "reward": 1.0338541924953462, "reward_std": 0.12921084687113762, "rewards/accuracy_reward": 0.0541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979687511920929, "step": 1173 }, { "clip_ratio": 0.0, "completion_length": 521.3708526611329, "epoch": 0.37574011841894706, "grad_norm": 0.26841455698013306, "kl": 0.4723584517836571, "learning_rate": 1.5716676127609277e-05, "loss": 0.0914, "reward": 1.1182291984558106, "reward_std": 0.15276648811995983, "rewards/accuracy_reward": 0.1437500050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791746139526, "step": 1174 }, { "clip_ratio": 0.0, "completion_length": 541.8854339599609, "epoch": 0.37606016962714034, "grad_norm": 0.2657756507396698, "kl": 0.5366487547755241, "learning_rate": 1.5707502759910246e-05, "loss": 0.0807, "reward": 1.0348958551883698, "reward_std": 0.15992402881383896, "rewards/accuracy_reward": 0.06458333488553762, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125059604645, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 523.4541809082032, "epoch": 0.3763802208353337, "grad_norm": 0.13122889399528503, "kl": 0.5268323123455048, "learning_rate": 1.5698322263284683e-05, "loss": 0.1078, "reward": 1.0385416924953461, "reward_std": 0.10363166444003583, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916805744171, "step": 1176 }, { "clip_ratio": 0.0, "completion_length": 530.8583557128907, "epoch": 0.37670027204352696, "grad_norm": 0.24503201246261597, "kl": 0.5733169555664063, "learning_rate": 1.568913464919944e-05, "loss": 0.1181, "reward": 1.041666704416275, "reward_std": 0.1343327358365059, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666805744171, "step": 1177 }, { "clip_ratio": 0.0, "completion_length": 505.1437652587891, "epoch": 0.3770203232517203, "grad_norm": 0.1513802856206894, "kl": 0.3681658856570721, "learning_rate": 1.5679939929130256e-05, "loss": 0.074, "reward": 1.126562511920929, "reward_std": 0.11795764788985252, "rewards/accuracy_reward": 0.14583333656191827, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291746139526, "step": 1178 }, { "clip_ratio": 0.0, "completion_length": 551.7875183105468, "epoch": 0.3773403744599136, "grad_norm": 0.12650415301322937, "kl": 0.48660945147275925, "learning_rate": 1.5670738114561744e-05, "loss": 0.1086, "reward": 1.0734375298023224, "reward_std": 0.163792784512043, "rewards/accuracy_reward": 0.10833333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041746139526, "step": 1179 }, { "clip_ratio": 0.0, "completion_length": 558.8083465576171, "epoch": 0.37766042566810687, "grad_norm": 0.21940185129642487, "kl": 0.8333341613411903, "learning_rate": 1.5661529216987393e-05, "loss": 0.186, "reward": 0.9989583551883697, "reward_std": 0.16733265221118926, "rewards/accuracy_reward": 0.05625000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9427083492279053, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 567.0000152587891, "epoch": 0.3779804768763002, "grad_norm": 0.47936850786209106, "kl": 1.1114118099212646, "learning_rate": 1.565231324790952e-05, "loss": 0.1621, "reward": 1.0494791984558105, "reward_std": 0.21776344440877438, "rewards/accuracy_reward": 0.10625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9432291865348816, "step": 1181 }, { "clip_ratio": 0.0, "completion_length": 543.5354370117187, "epoch": 0.3783005280844935, "grad_norm": 0.20487003028392792, "kl": 0.8265619874000549, "learning_rate": 1.564309021883929e-05, "loss": 0.1495, "reward": 1.0140625298023225, "reward_std": 0.16638899594545364, "rewards/accuracy_reward": 0.06458333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9494791865348816, "step": 1182 }, { "clip_ratio": 0.0, "completion_length": 535.5479339599609, "epoch": 0.37862057929268683, "grad_norm": 0.2059021294116974, "kl": 0.8217375859618187, "learning_rate": 1.563386014129667e-05, "loss": 0.2063, "reward": 1.097916692495346, "reward_std": 0.17657624781131745, "rewards/accuracy_reward": 0.14375000353902578, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9520833492279053, "step": 1183 }, { "clip_ratio": 0.0, "completion_length": 570.3583526611328, "epoch": 0.3789406305008801, "grad_norm": 0.36007773876190186, "kl": 1.1445650905370712, "learning_rate": 1.5624623026810445e-05, "loss": 0.2036, "reward": 1.0109375298023224, "reward_std": 0.19519389644265175, "rewards/accuracy_reward": 0.07708333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9338541924953461, "step": 1184 }, { "clip_ratio": 0.0, "completion_length": 578.383349609375, "epoch": 0.37926068170907346, "grad_norm": 0.13779407739639282, "kl": 0.8353869661688804, "learning_rate": 1.5615378886918183e-05, "loss": 0.1376, "reward": 1.0270833611488341, "reward_std": 0.20012224316596985, "rewards/accuracy_reward": 0.08541666846722365, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9416666865348816, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 522.8979370117188, "epoch": 0.37958073291726674, "grad_norm": 0.2753371298313141, "kl": 0.9140992000699043, "learning_rate": 1.5606127733166237e-05, "loss": 0.1652, "reward": 1.0192708432674409, "reward_std": 0.1692800521850586, "rewards/accuracy_reward": 0.06875000018626451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9505208432674408, "step": 1186 }, { "clip_ratio": 0.0, "completion_length": 559.2021057128907, "epoch": 0.3799007841254601, "grad_norm": 0.18957631289958954, "kl": 1.0052045956254005, "learning_rate": 1.5596869577109705e-05, "loss": 0.1419, "reward": 1.026562511920929, "reward_std": 0.18854286577552556, "rewards/accuracy_reward": 0.08750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9390625119209289, "step": 1187 }, { "clip_ratio": 0.0, "completion_length": 560.2250183105468, "epoch": 0.38022083533365336, "grad_norm": 0.14077381789684296, "kl": 0.771829554438591, "learning_rate": 1.5587604430312436e-05, "loss": 0.1185, "reward": 1.0666666865348815, "reward_std": 0.19701256975531578, "rewards/accuracy_reward": 0.1104166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.956250011920929, "step": 1188 }, { "clip_ratio": 0.0, "completion_length": 535.6041870117188, "epoch": 0.3805408865418467, "grad_norm": 0.20995795726776123, "kl": 0.8531022161245346, "learning_rate": 1.5578332304347016e-05, "loss": 0.1283, "reward": 1.0380208551883698, "reward_std": 0.17556421980261802, "rewards/accuracy_reward": 0.09166666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9463541805744171, "step": 1189 }, { "clip_ratio": 0.0, "completion_length": 549.9125183105468, "epoch": 0.38086093775004, "grad_norm": 0.23752635717391968, "kl": 0.9788353681564331, "learning_rate": 1.5569053210794748e-05, "loss": 0.095, "reward": 1.020312523841858, "reward_std": 0.13795197159051895, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9494791805744172, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 559.4187652587891, "epoch": 0.3811809889582333, "grad_norm": 0.3365929424762726, "kl": 1.0373542308807373, "learning_rate": 1.5559767161245633e-05, "loss": 0.1289, "reward": 1.0098958671092988, "reward_std": 0.2035977765917778, "rewards/accuracy_reward": 0.0687500013038516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9411458551883698, "step": 1191 }, { "clip_ratio": 0.0, "completion_length": 546.7958557128907, "epoch": 0.3815010401664266, "grad_norm": 0.13403218984603882, "kl": 0.6618030473589898, "learning_rate": 1.5550474167298364e-05, "loss": 0.0804, "reward": 1.1020833611488343, "reward_std": 0.1049573190510273, "rewards/accuracy_reward": 0.1375000050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833432674408, "step": 1192 }, { "clip_ratio": 0.0, "completion_length": 515.7479431152344, "epoch": 0.38182109137461995, "grad_norm": 0.16500230133533478, "kl": 0.7753990903496742, "learning_rate": 1.5541174240560303e-05, "loss": 0.1341, "reward": 1.091666692495346, "reward_std": 0.1889185607433319, "rewards/accuracy_reward": 0.13333333544433118, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333492279053, "step": 1193 }, { "clip_ratio": 0.0, "completion_length": 561.5000183105469, "epoch": 0.38214114258281323, "grad_norm": 0.2911393344402313, "kl": 0.9876785755157471, "learning_rate": 1.553186739264748e-05, "loss": 0.1403, "reward": 1.0031250238418579, "reward_std": 0.17999138236045836, "rewards/accuracy_reward": 0.05833333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9447916805744171, "step": 1194 }, { "clip_ratio": 0.0, "completion_length": 537.6625213623047, "epoch": 0.38246119379100657, "grad_norm": 0.4007449448108673, "kl": 1.0350771889090538, "learning_rate": 1.5522553635184567e-05, "loss": 0.1465, "reward": 1.007812535762787, "reward_std": 0.18713185042142869, "rewards/accuracy_reward": 0.058333334513008595, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9494791805744172, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 572.0166839599609, "epoch": 0.38278124499919985, "grad_norm": 0.10396149754524231, "kl": 0.5359139025211335, "learning_rate": 1.5513232979804854e-05, "loss": 0.0613, "reward": 1.0031250238418579, "reward_std": 0.1407000742852688, "rewards/accuracy_reward": 0.04166666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583432674408, "step": 1196 }, { "clip_ratio": 0.0, "completion_length": 523.1437683105469, "epoch": 0.3831012962073932, "grad_norm": 0.17239876091480255, "kl": 0.6517708688974381, "learning_rate": 1.550390543815026e-05, "loss": 0.1449, "reward": 1.0859375298023224, "reward_std": 0.17106188386678695, "rewards/accuracy_reward": 0.11666667181998491, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.967187511920929, "step": 1197 }, { "clip_ratio": 0.0, "completion_length": 550.6750244140625, "epoch": 0.3834213474155865, "grad_norm": 0.24555876851081848, "kl": 0.49216202795505526, "learning_rate": 1.549457102187131e-05, "loss": 0.1128, "reward": 1.1161458969116211, "reward_std": 0.2306416004896164, "rewards/accuracy_reward": 0.15208333656191825, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9619791865348816, "step": 1198 }, { "clip_ratio": 0.0, "completion_length": 555.770849609375, "epoch": 0.3837413986237798, "grad_norm": 0.1278693675994873, "kl": 0.5300887562334538, "learning_rate": 1.5485229742627102e-05, "loss": 0.0528, "reward": 1.1583333611488342, "reward_std": 0.15799203217029573, "rewards/accuracy_reward": 0.1895833395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500178813935, "step": 1199 }, { "clip_ratio": 0.0, "completion_length": 519.7000244140625, "epoch": 0.3840614498319731, "grad_norm": 0.36602315306663513, "kl": 0.49914331585168836, "learning_rate": 1.5475881612085313e-05, "loss": 0.1042, "reward": 1.0505208730697633, "reward_std": 0.17699409797787666, "rewards/accuracy_reward": 0.08125000223517417, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708551883698, "step": 1200 }, { "clip_ratio": 0.0, "completion_length": 508.54169006347655, "epoch": 0.38438150104016644, "grad_norm": 0.17335864901542664, "kl": 0.46766447871923444, "learning_rate": 1.5466526641922174e-05, "loss": 0.077, "reward": 1.032291704416275, "reward_std": 0.13878519721329213, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1201 }, { "clip_ratio": 0.0, "completion_length": 542.2083435058594, "epoch": 0.3847015522483597, "grad_norm": 0.12205848842859268, "kl": 0.32414179742336274, "learning_rate": 1.5457164843822465e-05, "loss": 0.0973, "reward": 1.1067708551883697, "reward_std": 0.11873383224010467, "rewards/accuracy_reward": 0.13541667275130748, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541865348816, "step": 1202 }, { "clip_ratio": 0.0, "completion_length": 544.014599609375, "epoch": 0.38502160345655306, "grad_norm": 0.11642692983150482, "kl": 0.6380541652441025, "learning_rate": 1.5447796229479495e-05, "loss": 0.1153, "reward": 1.0760416984558105, "reward_std": 0.1689482469111681, "rewards/accuracy_reward": 0.11458333730697631, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583551883698, "step": 1203 }, { "clip_ratio": 0.0, "completion_length": 539.6271026611328, "epoch": 0.38534165466474635, "grad_norm": 0.12941080331802368, "kl": 0.30585852190852164, "learning_rate": 1.5438420810595073e-05, "loss": 0.0648, "reward": 1.0114583611488341, "reward_std": 0.10651846360415221, "rewards/accuracy_reward": 0.029166668094694613, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916805744171, "step": 1204 }, { "clip_ratio": 0.0, "completion_length": 526.935433959961, "epoch": 0.3856617058729397, "grad_norm": 0.3982611298561096, "kl": 0.6207052066922187, "learning_rate": 1.5429038598879526e-05, "loss": 0.0918, "reward": 1.0307291924953461, "reward_std": 0.15194091200828552, "rewards/accuracy_reward": 0.06041666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125178813934, "step": 1205 }, { "clip_ratio": 0.0, "completion_length": 514.6979339599609, "epoch": 0.38598175708113297, "grad_norm": 0.06387288123369217, "kl": 0.30112158581614495, "learning_rate": 1.5419649606051648e-05, "loss": 0.0393, "reward": 1.079687523841858, "reward_std": 0.09635667633265257, "rewards/accuracy_reward": 0.09583333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541746139526, "step": 1206 }, { "clip_ratio": 0.0, "completion_length": 541.0291778564454, "epoch": 0.3863018082893263, "grad_norm": 0.2889772057533264, "kl": 0.6139212623238564, "learning_rate": 1.5410253843838717e-05, "loss": 0.0946, "reward": 1.0229166924953461, "reward_std": 0.10145474877208471, "rewards/accuracy_reward": 0.04791666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 1207 }, { "clip_ratio": 0.0, "completion_length": 560.7708618164063, "epoch": 0.3866218594975196, "grad_norm": 0.22425009310245514, "kl": 0.39511779621243476, "learning_rate": 1.540085132397646e-05, "loss": 0.0991, "reward": 1.110416704416275, "reward_std": 0.12801450863480568, "rewards/accuracy_reward": 0.13333333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833492279053, "step": 1208 }, { "clip_ratio": 0.0, "completion_length": 546.0166870117188, "epoch": 0.38694191070571293, "grad_norm": 0.16940173506736755, "kl": 0.48901860415935516, "learning_rate": 1.539144205820905e-05, "loss": 0.1193, "reward": 1.0723958790302277, "reward_std": 0.18953076004981995, "rewards/accuracy_reward": 0.11041666809469461, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.961979192495346, "step": 1209 }, { "clip_ratio": 0.0, "completion_length": 602.7062713623047, "epoch": 0.3872619619139062, "grad_norm": 0.24883034825325012, "kl": 0.6189972922205925, "learning_rate": 1.538202605828907e-05, "loss": 0.1052, "reward": 0.9833333611488342, "reward_std": 0.15337586775422096, "rewards/accuracy_reward": 0.02500000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333551883697, "step": 1210 }, { "clip_ratio": 0.0, "completion_length": 636.789599609375, "epoch": 0.38758201312209956, "grad_norm": 0.5786344408988953, "kl": 0.9378010019659996, "learning_rate": 1.5372603335977537e-05, "loss": 0.1549, "reward": 0.9421875298023223, "reward_std": 0.22690111324191092, "rewards/accuracy_reward": 0.027083333395421506, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9109375178813934, "step": 1211 }, { "clip_ratio": 0.0, "completion_length": 606.358349609375, "epoch": 0.38790206433029284, "grad_norm": 0.17479492723941803, "kl": 0.7791576758027077, "learning_rate": 1.536317390304385e-05, "loss": 0.1551, "reward": 0.9604166865348815, "reward_std": 0.22087621092796325, "rewards/accuracy_reward": 0.03541666809469461, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9250000178813934, "step": 1212 }, { "clip_ratio": 0.0, "completion_length": 586.2312683105469, "epoch": 0.3882221155384862, "grad_norm": 0.1706581711769104, "kl": 0.5600812263786793, "learning_rate": 1.5353737771265785e-05, "loss": 0.0847, "reward": 0.971354192495346, "reward_std": 0.1764809437096119, "rewards/accuracy_reward": 0.02500000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9463541865348816, "step": 1213 }, { "clip_ratio": 0.0, "completion_length": 611.3479370117187, "epoch": 0.38854216674667946, "grad_norm": 0.12541788816452026, "kl": 0.3706213489174843, "learning_rate": 1.5344294952429506e-05, "loss": 0.0839, "reward": 1.029166692495346, "reward_std": 0.20604985877871512, "rewards/accuracy_reward": 0.0750000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9541666865348816, "step": 1214 }, { "clip_ratio": 0.0, "completion_length": 607.0854431152344, "epoch": 0.3888622179548728, "grad_norm": 0.30533111095428467, "kl": 0.4709593154489994, "learning_rate": 1.5334845458329505e-05, "loss": 0.0993, "reward": 1.1036458492279053, "reward_std": 0.15157056059688329, "rewards/accuracy_reward": 0.15625000409781933, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9473958492279053, "step": 1215 }, { "clip_ratio": 0.0, "completion_length": 604.7583526611328, "epoch": 0.3891822691630661, "grad_norm": 0.2925158143043518, "kl": 0.510768836736679, "learning_rate": 1.532538930076863e-05, "loss": 0.0869, "reward": 1.0177083611488342, "reward_std": 0.20111262276768685, "rewards/accuracy_reward": 0.06250000055879354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9552083492279053, "step": 1216 }, { "clip_ratio": 0.0, "completion_length": 624.5041931152343, "epoch": 0.3895023203712594, "grad_norm": 0.30002668499946594, "kl": 0.3750695250928402, "learning_rate": 1.5315926491558045e-05, "loss": 0.0757, "reward": 1.0395833611488343, "reward_std": 0.18180562406778336, "rewards/accuracy_reward": 0.08125000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333492279053, "step": 1217 }, { "clip_ratio": 0.0, "completion_length": 600.264599609375, "epoch": 0.3898223715794527, "grad_norm": 0.22842350602149963, "kl": 0.5847925186157227, "learning_rate": 1.5306457042517218e-05, "loss": 0.0595, "reward": 0.9734375238418579, "reward_std": 0.11920136883854866, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9630208551883698, "step": 1218 }, { "clip_ratio": 0.0, "completion_length": 629.6958557128906, "epoch": 0.39014242278764605, "grad_norm": 0.14000339806079865, "kl": 0.4536043472588062, "learning_rate": 1.5296980965473918e-05, "loss": 0.0439, "reward": 1.099479180574417, "reward_std": 0.13088055774569513, "rewards/accuracy_reward": 0.12500000298023223, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9723958492279052, "step": 1219 }, { "clip_ratio": 0.0, "completion_length": 576.4354309082031, "epoch": 0.39046247399583933, "grad_norm": 0.2920055091381073, "kl": 0.48158343955874444, "learning_rate": 1.5287498272264192e-05, "loss": 0.1022, "reward": 1.1161458492279053, "reward_std": 0.1597229868173599, "rewards/accuracy_reward": 0.14791667014360427, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291746139526, "step": 1220 }, { "clip_ratio": 0.0, "completion_length": 600.8666839599609, "epoch": 0.39078252520403267, "grad_norm": 0.3325548470020294, "kl": 0.7620904207229614, "learning_rate": 1.5278008974732346e-05, "loss": 0.1268, "reward": 1.0489583611488342, "reward_std": 0.2112396091222763, "rewards/accuracy_reward": 0.09166666734963655, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916865348816, "step": 1221 }, { "clip_ratio": 0.0, "completion_length": 621.5583557128906, "epoch": 0.39110257641222596, "grad_norm": 0.1554885059595108, "kl": 0.44617650359869004, "learning_rate": 1.5268513084730935e-05, "loss": 0.0812, "reward": 1.0286458611488343, "reward_std": 0.1501821421086788, "rewards/accuracy_reward": 0.06250000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458492279053, "step": 1222 }, { "clip_ratio": 0.0, "completion_length": 604.2125244140625, "epoch": 0.3914226276204193, "grad_norm": 0.12901164591312408, "kl": 0.5715129643678665, "learning_rate": 1.5259010614120755e-05, "loss": 0.1106, "reward": 1.045312523841858, "reward_std": 0.1480789568275213, "rewards/accuracy_reward": 0.08958333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291805744171, "step": 1223 }, { "clip_ratio": 0.0, "completion_length": 577.1708618164063, "epoch": 0.3917426788286126, "grad_norm": 0.14290817081928253, "kl": 0.5230883605778217, "learning_rate": 1.5249501574770815e-05, "loss": 0.074, "reward": 1.0343750357627868, "reward_std": 0.16299154441803693, "rewards/accuracy_reward": 0.06458333414047956, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916805744171, "step": 1224 }, { "clip_ratio": 0.0, "completion_length": 606.3875213623047, "epoch": 0.39206273003680586, "grad_norm": 0.1658349335193634, "kl": 0.3186629630625248, "learning_rate": 1.5239985978558333e-05, "loss": 0.0573, "reward": 1.0692708551883698, "reward_std": 0.10922669228166342, "rewards/accuracy_reward": 0.09375000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 1225 }, { "clip_ratio": 0.0, "completion_length": 581.2208435058594, "epoch": 0.3923827812449992, "grad_norm": 0.09366770088672638, "kl": 0.3164710126817226, "learning_rate": 1.5230463837368713e-05, "loss": 0.0658, "reward": 1.0744791865348815, "reward_std": 0.09520575925707817, "rewards/accuracy_reward": 0.09583333488553762, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458432674408, "step": 1226 }, { "clip_ratio": 0.0, "completion_length": 617.1521057128906, "epoch": 0.3927028324531925, "grad_norm": 0.14050906896591187, "kl": 0.4312257021665573, "learning_rate": 1.5220935163095534e-05, "loss": 0.1062, "reward": 1.0781250298023224, "reward_std": 0.14153016209602357, "rewards/accuracy_reward": 0.11458333730697631, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.963541692495346, "step": 1227 }, { "clip_ratio": 0.0, "completion_length": 606.0312652587891, "epoch": 0.3930228836613858, "grad_norm": 0.12579607963562012, "kl": 0.45440919920802114, "learning_rate": 1.521139996764054e-05, "loss": 0.0825, "reward": 1.0364583551883697, "reward_std": 0.10581081509590148, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916865348816, "step": 1228 }, { "clip_ratio": 0.0, "completion_length": 571.8896057128907, "epoch": 0.3933429348695791, "grad_norm": 0.13927984237670898, "kl": 0.44168696030974386, "learning_rate": 1.5201858262913619e-05, "loss": 0.0928, "reward": 1.0677083611488343, "reward_std": 0.15336225517094135, "rewards/accuracy_reward": 0.0937500026077032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583432674408, "step": 1229 }, { "clip_ratio": 0.0, "completion_length": 587.508349609375, "epoch": 0.39366298607777245, "grad_norm": 0.16402070224285126, "kl": 0.3885528713464737, "learning_rate": 1.519231006083278e-05, "loss": 0.0758, "reward": 1.030208373069763, "reward_std": 0.1770285289734602, "rewards/accuracy_reward": 0.05625000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 1230 }, { "clip_ratio": 0.0, "completion_length": 574.4666870117187, "epoch": 0.39398303728596573, "grad_norm": 0.08129875361919403, "kl": 0.3222904376685619, "learning_rate": 1.5182755373324162e-05, "loss": 0.0497, "reward": 1.086979192495346, "reward_std": 0.07945878580212593, "rewards/accuracy_reward": 0.10625000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291805744172, "step": 1231 }, { "clip_ratio": 0.0, "completion_length": 571.8791900634766, "epoch": 0.39430308849415907, "grad_norm": 0.1860627830028534, "kl": 0.4070753358304501, "learning_rate": 1.5173194212321996e-05, "loss": 0.1034, "reward": 1.0458333492279053, "reward_std": 0.12442791275680065, "rewards/accuracy_reward": 0.07500000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 1232 }, { "clip_ratio": 0.0, "completion_length": 582.0083557128906, "epoch": 0.39462313970235235, "grad_norm": 0.15054498612880707, "kl": 0.22181895673274993, "learning_rate": 1.5163626589768598e-05, "loss": 0.0497, "reward": 1.0265625178813935, "reward_std": 0.09489289149641991, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958432674408, "step": 1233 }, { "clip_ratio": 0.0, "completion_length": 597.5416931152344, "epoch": 0.3949431909105457, "grad_norm": 0.10187875479459763, "kl": 0.375130108743906, "learning_rate": 1.5154052517614361e-05, "loss": 0.0653, "reward": 1.0416666924953462, "reward_std": 0.14163720104843378, "rewards/accuracy_reward": 0.07291666921228171, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500238418579, "step": 1234 }, { "clip_ratio": 0.0, "completion_length": 547.6958618164062, "epoch": 0.395263242118739, "grad_norm": 0.169550821185112, "kl": 0.3253509186208248, "learning_rate": 1.5144472007817723e-05, "loss": 0.0672, "reward": 1.0520833551883697, "reward_std": 0.10514021962881089, "rewards/accuracy_reward": 0.07083333488553763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981250011920929, "step": 1235 }, { "clip_ratio": 0.0, "completion_length": 536.4812652587891, "epoch": 0.3955832933269323, "grad_norm": 0.09590303897857666, "kl": 0.2758027367293835, "learning_rate": 1.5134885072345178e-05, "loss": 0.0747, "reward": 1.1109375357627869, "reward_std": 0.15622661411762237, "rewards/accuracy_reward": 0.13541666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 1236 }, { "clip_ratio": 0.0, "completion_length": 546.527099609375, "epoch": 0.3959033445351256, "grad_norm": 0.1634804904460907, "kl": 0.312994534522295, "learning_rate": 1.512529172317123e-05, "loss": 0.0633, "reward": 1.1515625357627868, "reward_std": 0.1235880684107542, "rewards/accuracy_reward": 0.1791666727513075, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958611488343, "step": 1237 }, { "clip_ratio": 0.0, "completion_length": 544.0979370117187, "epoch": 0.39622339574331894, "grad_norm": 0.19042591750621796, "kl": 0.4922703742980957, "learning_rate": 1.511569197227841e-05, "loss": 0.0658, "reward": 1.0942708611488343, "reward_std": 0.1376211117953062, "rewards/accuracy_reward": 0.12291667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541865348816, "step": 1238 }, { "clip_ratio": 0.0, "completion_length": 561.7187622070312, "epoch": 0.3965434469515122, "grad_norm": 0.18415957689285278, "kl": 0.6596296966075897, "learning_rate": 1.5106085831657229e-05, "loss": 0.0724, "reward": 1.1057291984558106, "reward_std": 0.11891843751072884, "rewards/accuracy_reward": 0.1354166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125178813934, "step": 1239 }, { "clip_ratio": 0.0, "completion_length": 570.3958435058594, "epoch": 0.39686349815970556, "grad_norm": 0.10486699640750885, "kl": 0.23581696003675462, "learning_rate": 1.509647331330619e-05, "loss": 0.0432, "reward": 1.1166666865348815, "reward_std": 0.08925211485475301, "rewards/accuracy_reward": 0.12916666995733977, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987500011920929, "step": 1240 }, { "clip_ratio": 0.0, "completion_length": 555.5791778564453, "epoch": 0.39718354936789885, "grad_norm": 0.12086888402700424, "kl": 0.337103009223938, "learning_rate": 1.5086854429231763e-05, "loss": 0.0375, "reward": 1.010937511920929, "reward_std": 0.11605828888714313, "rewards/accuracy_reward": 0.02708333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541746139526, "step": 1241 }, { "clip_ratio": 0.0, "completion_length": 532.839599609375, "epoch": 0.3975036005760922, "grad_norm": 0.16846847534179688, "kl": 0.5034018464386463, "learning_rate": 1.5077229191448357e-05, "loss": 0.0618, "reward": 1.1223958492279054, "reward_std": 0.114602355286479, "rewards/accuracy_reward": 0.14166666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291865348816, "step": 1242 }, { "clip_ratio": 0.0, "completion_length": 511.495849609375, "epoch": 0.39782365178428547, "grad_norm": 0.1870306432247162, "kl": 0.19878256246447562, "learning_rate": 1.506759761197833e-05, "loss": 0.0617, "reward": 1.1442708671092987, "reward_std": 0.07291666753590106, "rewards/accuracy_reward": 0.1562500050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9880208432674408, "step": 1243 }, { "clip_ratio": 0.0, "completion_length": 538.8666900634765, "epoch": 0.3981437029924788, "grad_norm": 0.25140678882598877, "kl": 0.22704439386725425, "learning_rate": 1.5057959702851953e-05, "loss": 0.0576, "reward": 1.0994792044162751, "reward_std": 0.10983092840760947, "rewards/accuracy_reward": 0.11666667088866234, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9828125178813935, "step": 1244 }, { "clip_ratio": 0.0, "completion_length": 535.3166961669922, "epoch": 0.3984637542006721, "grad_norm": 0.210471972823143, "kl": 0.4046620957553387, "learning_rate": 1.5048315476107412e-05, "loss": 0.0813, "reward": 1.1104166805744171, "reward_std": 0.1495097540318966, "rewards/accuracy_reward": 0.13750000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166865348816, "step": 1245 }, { "clip_ratio": 0.0, "completion_length": 533.5750152587891, "epoch": 0.39878380540886543, "grad_norm": 0.09761213511228561, "kl": 0.21089787110686303, "learning_rate": 1.5038664943790768e-05, "loss": 0.054, "reward": 1.0968750238418579, "reward_std": 0.08261781334877014, "rewards/accuracy_reward": 0.11250000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1246 }, { "clip_ratio": 0.0, "completion_length": 554.989599609375, "epoch": 0.3991038566170587, "grad_norm": 0.19265764951705933, "kl": 0.6630521953105927, "learning_rate": 1.5029008117955978e-05, "loss": 0.1042, "reward": 1.1255208730697632, "reward_std": 0.1521202649921179, "rewards/accuracy_reward": 0.15000000335276126, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208492279053, "step": 1247 }, { "clip_ratio": 0.0, "completion_length": 541.2479278564454, "epoch": 0.39942390782525206, "grad_norm": 0.1177937313914299, "kl": 0.32113772705197335, "learning_rate": 1.5019345010664845e-05, "loss": 0.0734, "reward": 1.1291666984558106, "reward_std": 0.11449034418910742, "rewards/accuracy_reward": 0.15000000465661287, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666746139527, "step": 1248 }, { "clip_ratio": 0.0, "completion_length": 517.6520965576171, "epoch": 0.39974395903344534, "grad_norm": 0.13072729110717773, "kl": 0.4669448517262936, "learning_rate": 1.5009675633987027e-05, "loss": 0.0444, "reward": 1.063541692495346, "reward_std": 0.11234742254018784, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9864583432674408, "step": 1249 }, { "clip_ratio": 0.0, "completion_length": 516.2000183105469, "epoch": 0.4000640102416387, "grad_norm": 0.14546598494052887, "kl": 0.41185231059789656, "learning_rate": 1.5000000000000002e-05, "loss": 0.0769, "reward": 1.1276041984558105, "reward_std": 0.0777821946889162, "rewards/accuracy_reward": 0.1416666718199849, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375059604645, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 542.3812744140625, "epoch": 0.40038406144983196, "grad_norm": 0.1501355767250061, "kl": 0.47948102802038195, "learning_rate": 1.4990318120789074e-05, "loss": 0.0584, "reward": 1.060416692495346, "reward_std": 0.06387959867715835, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9875000059604645, "step": 1251 }, { "clip_ratio": 0.0, "completion_length": 518.3416870117187, "epoch": 0.4007041126580253, "grad_norm": 0.13536398112773895, "kl": 0.394258227199316, "learning_rate": 1.4980630008447343e-05, "loss": 0.0783, "reward": 1.0604166984558105, "reward_std": 0.11339464448392392, "rewards/accuracy_reward": 0.08541666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000238418579, "step": 1252 }, { "clip_ratio": 0.0, "completion_length": 536.7666778564453, "epoch": 0.4010241638662186, "grad_norm": 0.08538512885570526, "kl": 0.3164933010935783, "learning_rate": 1.4970935675075694e-05, "loss": 0.0867, "reward": 1.0281250298023223, "reward_std": 0.12806884478777647, "rewards/accuracy_reward": 0.04791666734963655, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083432674408, "step": 1253 }, { "clip_ratio": 0.0, "completion_length": 537.8979339599609, "epoch": 0.4013442150744119, "grad_norm": 0.11755944043397903, "kl": 0.36844282820820806, "learning_rate": 1.496123513278279e-05, "loss": 0.0792, "reward": 1.0468750298023224, "reward_std": 0.11137478947639465, "rewards/accuracy_reward": 0.0645833346992731, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916805744171, "step": 1254 }, { "clip_ratio": 0.0, "completion_length": 538.1604370117187, "epoch": 0.4016642662826052, "grad_norm": 0.10501649975776672, "kl": 0.3535917893052101, "learning_rate": 1.4951528393685033e-05, "loss": 0.078, "reward": 1.0364583611488343, "reward_std": 0.11245781332254409, "rewards/accuracy_reward": 0.05208333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750178813935, "step": 1255 }, { "clip_ratio": 0.0, "completion_length": 527.7729370117188, "epoch": 0.40198431749079855, "grad_norm": 0.6395413875579834, "kl": 0.40054913982748985, "learning_rate": 1.4941815469906578e-05, "loss": 0.0792, "reward": 1.0505208492279052, "reward_std": 0.0950919346883893, "rewards/accuracy_reward": 0.06458333488553762, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 1256 }, { "clip_ratio": 0.0, "completion_length": 523.3854400634766, "epoch": 0.40230436869899183, "grad_norm": 0.14963221549987793, "kl": 0.31750036850571633, "learning_rate": 1.4932096373579304e-05, "loss": 0.0544, "reward": 1.0114583492279052, "reward_std": 0.12480773292481899, "rewards/accuracy_reward": 0.029166667722165585, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916865348816, "step": 1257 }, { "clip_ratio": 0.0, "completion_length": 553.2250122070312, "epoch": 0.40262441990718517, "grad_norm": 0.09617770463228226, "kl": 0.45049638152122495, "learning_rate": 1.49223711168428e-05, "loss": 0.125, "reward": 1.0140625178813933, "reward_std": 0.128690517693758, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291805744171, "step": 1258 }, { "clip_ratio": 0.0, "completion_length": 556.6125183105469, "epoch": 0.40294447111537846, "grad_norm": 0.2407182902097702, "kl": 0.42702504619956017, "learning_rate": 1.4912639711844341e-05, "loss": 0.0967, "reward": 1.1062500476837158, "reward_std": 0.12439418062567711, "rewards/accuracy_reward": 0.13125000409781934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000238418579, "step": 1259 }, { "clip_ratio": 0.0, "completion_length": 524.9437683105468, "epoch": 0.4032645223235718, "grad_norm": 0.055207595229148865, "kl": 0.26149168610572815, "learning_rate": 1.490290217073889e-05, "loss": 0.0297, "reward": 1.1635416924953461, "reward_std": 0.08959200419485569, "rewards/accuracy_reward": 0.17708334047347307, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583432674408, "step": 1260 }, { "clip_ratio": 0.0, "completion_length": 553.2416839599609, "epoch": 0.4035845735317651, "grad_norm": 0.0995592474937439, "kl": 0.39583138301968573, "learning_rate": 1.4893158505689071e-05, "loss": 0.0881, "reward": 1.0114583492279052, "reward_std": 0.1348001252859831, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.971875011920929, "step": 1261 }, { "clip_ratio": 0.0, "completion_length": 579.0791809082032, "epoch": 0.4039046247399584, "grad_norm": 0.07839322835206985, "kl": 0.2790032118558884, "learning_rate": 1.4883408728865164e-05, "loss": 0.0807, "reward": 1.0427083551883698, "reward_std": 0.1275832900777459, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1262 }, { "clip_ratio": 0.0, "completion_length": 554.4375122070312, "epoch": 0.4042246759481517, "grad_norm": 0.06481237709522247, "kl": 0.22605575621128082, "learning_rate": 1.487365285244507e-05, "loss": 0.0793, "reward": 1.0750000298023223, "reward_std": 0.12191822603344918, "rewards/accuracy_reward": 0.10208333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166805744172, "step": 1263 }, { "clip_ratio": 0.0, "completion_length": 561.8937774658203, "epoch": 0.40454472715634504, "grad_norm": 0.058921996504068375, "kl": 0.2315479911863804, "learning_rate": 1.4863890888614314e-05, "loss": 0.0253, "reward": 1.2218750417232513, "reward_std": 0.13045966662466527, "rewards/accuracy_reward": 0.24166667126119137, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083432674408, "step": 1264 }, { "clip_ratio": 0.0, "completion_length": 542.0125183105469, "epoch": 0.4048647783645383, "grad_norm": 0.18803565204143524, "kl": 0.6443677566945553, "learning_rate": 1.4854122849566032e-05, "loss": 0.1322, "reward": 0.9989583611488342, "reward_std": 0.1535223349928856, "rewards/accuracy_reward": 0.03750000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583492279053, "step": 1265 }, { "clip_ratio": 0.0, "completion_length": 561.0375213623047, "epoch": 0.40518482957273166, "grad_norm": 0.09277193248271942, "kl": 0.31689817234873774, "learning_rate": 1.484434874750094e-05, "loss": 0.0662, "reward": 1.0484375178813934, "reward_std": 0.14467626363039016, "rewards/accuracy_reward": 0.06875000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9796875059604645, "step": 1266 }, { "clip_ratio": 0.0, "completion_length": 558.3333526611328, "epoch": 0.40550488078092495, "grad_norm": 0.11833394318819046, "kl": 0.29800432324409487, "learning_rate": 1.483456859462733e-05, "loss": 0.0553, "reward": 1.079166692495346, "reward_std": 0.12277526557445526, "rewards/accuracy_reward": 0.10000000353902579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666805744171, "step": 1267 }, { "clip_ratio": 0.0, "completion_length": 579.3791809082031, "epoch": 0.40582493198911823, "grad_norm": 0.0788242369890213, "kl": 0.3021985150873661, "learning_rate": 1.4824782403161049e-05, "loss": 0.0636, "reward": 1.0885416984558105, "reward_std": 0.11099445223808288, "rewards/accuracy_reward": 0.11041666995733976, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.978125, "step": 1268 }, { "clip_ratio": 0.0, "completion_length": 546.2083557128906, "epoch": 0.40614498319731157, "grad_norm": 0.15254127979278564, "kl": 0.24624716192483903, "learning_rate": 1.4814990185325488e-05, "loss": 0.0263, "reward": 1.0864583611488343, "reward_std": 0.10864646323025226, "rewards/accuracy_reward": 0.1000000024214387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583432674408, "step": 1269 }, { "clip_ratio": 0.0, "completion_length": 564.1250183105469, "epoch": 0.40646503440550485, "grad_norm": 0.08833687007427216, "kl": 0.3731885127723217, "learning_rate": 1.480519195335157e-05, "loss": 0.122, "reward": 1.0072916924953461, "reward_std": 0.0928686197847128, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583611488343, "step": 1270 }, { "clip_ratio": 0.0, "completion_length": 588.7500244140625, "epoch": 0.4067850856136982, "grad_norm": 0.1265832781791687, "kl": 0.41062879338860514, "learning_rate": 1.4795387719477719e-05, "loss": 0.0834, "reward": 1.1348958671092988, "reward_std": 0.1514855232089758, "rewards/accuracy_reward": 0.16666666995733975, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291924953461, "step": 1271 }, { "clip_ratio": 0.0, "completion_length": 583.0958465576172, "epoch": 0.4071051368218915, "grad_norm": 0.10724162310361862, "kl": 0.4103553980588913, "learning_rate": 1.4785577495949866e-05, "loss": 0.0846, "reward": 1.045833373069763, "reward_std": 0.15014931112527846, "rewards/accuracy_reward": 0.07083333600312472, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 1272 }, { "clip_ratio": 0.0, "completion_length": 555.7375183105469, "epoch": 0.4074251880300848, "grad_norm": 0.11321690678596497, "kl": 0.6172791600227356, "learning_rate": 1.4775761295021418e-05, "loss": 0.1314, "reward": 1.1250000238418578, "reward_std": 0.1790860690176487, "rewards/accuracy_reward": 0.15416666883975266, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333492279053, "step": 1273 }, { "clip_ratio": 0.0, "completion_length": 566.4604370117188, "epoch": 0.4077452392382781, "grad_norm": 0.0912218764424324, "kl": 0.37791914120316505, "learning_rate": 1.4765939128953255e-05, "loss": 0.0626, "reward": 1.0703125238418578, "reward_std": 0.16757386401295662, "rewards/accuracy_reward": 0.09375000353902578, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625059604645, "step": 1274 }, { "clip_ratio": 0.0, "completion_length": 591.5937744140625, "epoch": 0.40806529044647144, "grad_norm": 0.1103522852063179, "kl": 0.23164157569408417, "learning_rate": 1.4756111010013694e-05, "loss": 0.046, "reward": 1.0833333611488343, "reward_std": 0.1385488674044609, "rewards/accuracy_reward": 0.10000000521540642, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.981250011920929, "step": 1275 }, { "clip_ratio": 0.0, "completion_length": 567.1708526611328, "epoch": 0.4083853416546647, "grad_norm": 0.1879657357931137, "kl": 0.4369122177362442, "learning_rate": 1.47462769504785e-05, "loss": 0.1146, "reward": 1.0880208492279053, "reward_std": 0.11164197325706482, "rewards/accuracy_reward": 0.11041666995733976, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 1276 }, { "clip_ratio": 0.0, "completion_length": 608.6062622070312, "epoch": 0.40870539286285806, "grad_norm": 0.22609193623065948, "kl": 0.8324251100420952, "learning_rate": 1.473643696263085e-05, "loss": 0.1212, "reward": 0.9630208373069763, "reward_std": 0.1323666602373123, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375059604645, "step": 1277 }, { "clip_ratio": 0.0, "completion_length": 595.5500305175781, "epoch": 0.40902544407105135, "grad_norm": 0.14130529761314392, "kl": 0.5213746406137943, "learning_rate": 1.4726591058761336e-05, "loss": 0.0789, "reward": 1.1119791984558105, "reward_std": 0.1154967736452818, "rewards/accuracy_reward": 0.13750000409781932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791865348816, "step": 1278 }, { "clip_ratio": 0.0, "completion_length": 591.8812622070312, "epoch": 0.4093454952792447, "grad_norm": 0.12913070619106293, "kl": 0.6176031097769737, "learning_rate": 1.4716739251167931e-05, "loss": 0.1167, "reward": 0.9677083492279053, "reward_std": 0.12283567264676094, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965625011920929, "step": 1279 }, { "clip_ratio": 0.0, "completion_length": 590.9333618164062, "epoch": 0.40966554648743797, "grad_norm": 0.22252824902534485, "kl": 0.608339787274599, "learning_rate": 1.470688155215598e-05, "loss": 0.1006, "reward": 1.0734375417232513, "reward_std": 0.18009853959083558, "rewards/accuracy_reward": 0.10208333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541865348816, "step": 1280 }, { "clip_ratio": 0.0, "completion_length": 563.970849609375, "epoch": 0.4099855976956313, "grad_norm": 0.2188696563243866, "kl": 0.6703398540616036, "learning_rate": 1.4697017974038192e-05, "loss": 0.1193, "reward": 0.9994791984558106, "reward_std": 0.12184309475123882, "rewards/accuracy_reward": 0.037500002048909666, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791865348816, "step": 1281 }, { "clip_ratio": 0.0, "completion_length": 583.2937683105469, "epoch": 0.4103056489038246, "grad_norm": 0.46435990929603577, "kl": 0.5058012694120407, "learning_rate": 1.4687148529134621e-05, "loss": 0.1268, "reward": 0.9937500178813934, "reward_std": 0.16947735473513603, "rewards/accuracy_reward": 0.035416668839752675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333492279053, "step": 1282 }, { "clip_ratio": 0.0, "completion_length": 608.6354370117188, "epoch": 0.41062570011201793, "grad_norm": 0.08456681668758392, "kl": 0.2596034061163664, "learning_rate": 1.467727322977264e-05, "loss": 0.0533, "reward": 1.015625011920929, "reward_std": 0.08625244870781898, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250059604645, "step": 1283 }, { "clip_ratio": 0.0, "completion_length": 598.0146057128907, "epoch": 0.4109457513202112, "grad_norm": 0.14722603559494019, "kl": 0.26935129314661027, "learning_rate": 1.4667392088286946e-05, "loss": 0.0653, "reward": 1.1036458671092988, "reward_std": 0.12447739690542221, "rewards/accuracy_reward": 0.12500000409781933, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458432674408, "step": 1284 }, { "clip_ratio": 0.0, "completion_length": 541.1020935058593, "epoch": 0.41126580252840456, "grad_norm": 0.1227230504155159, "kl": 0.3881824046373367, "learning_rate": 1.4657505117019523e-05, "loss": 0.1171, "reward": 1.140625035762787, "reward_std": 0.1513745330274105, "rewards/accuracy_reward": 0.16875000558793546, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.971875011920929, "step": 1285 }, { "clip_ratio": 0.0, "completion_length": 550.5791900634765, "epoch": 0.41158585373659784, "grad_norm": 0.04737142100930214, "kl": 0.231598449498415, "learning_rate": 1.4647612328319645e-05, "loss": 0.0141, "reward": 1.1666666984558105, "reward_std": 0.09083670191466808, "rewards/accuracy_reward": 0.17916667461395264, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9854166805744171, "step": 1286 }, { "clip_ratio": 0.0, "completion_length": 596.2229370117187, "epoch": 0.4119059049447912, "grad_norm": 0.10313890129327774, "kl": 0.2947948418557644, "learning_rate": 1.4637713734543844e-05, "loss": 0.0591, "reward": 1.092187523841858, "reward_std": 0.07089405842125415, "rewards/accuracy_reward": 0.10833333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541805744171, "step": 1287 }, { "clip_ratio": 0.0, "completion_length": 551.3896026611328, "epoch": 0.41222595615298446, "grad_norm": 0.12789444625377655, "kl": 0.3417969450354576, "learning_rate": 1.4627809348055908e-05, "loss": 0.0693, "reward": 1.0427083730697633, "reward_std": 0.12029938250780106, "rewards/accuracy_reward": 0.06875000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 1288 }, { "clip_ratio": 0.0, "completion_length": 548.1062683105469, "epoch": 0.4125460073611778, "grad_norm": 0.06556902825832367, "kl": 0.1632651649415493, "learning_rate": 1.461789918122686e-05, "loss": 0.0484, "reward": 1.0713542103767395, "reward_std": 0.10263727717101574, "rewards/accuracy_reward": 0.08541666902601719, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 1289 }, { "clip_ratio": 0.0, "completion_length": 608.1187744140625, "epoch": 0.4128660585693711, "grad_norm": 0.03857985883951187, "kl": 0.14200967624783517, "learning_rate": 1.460798324643494e-05, "loss": 0.0268, "reward": 1.1416666984558106, "reward_std": 0.09959968477487564, "rewards/accuracy_reward": 0.15000000577419997, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916666746139526, "step": 1290 }, { "clip_ratio": 0.0, "completion_length": 553.018765258789, "epoch": 0.4131861097775644, "grad_norm": 0.13862329721450806, "kl": 0.49638293087482455, "learning_rate": 1.4598061556065598e-05, "loss": 0.0776, "reward": 1.066666692495346, "reward_std": 0.13814237490296363, "rewards/accuracy_reward": 0.09166666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000178813935, "step": 1291 }, { "clip_ratio": 0.0, "completion_length": 546.9104431152343, "epoch": 0.4135061609857577, "grad_norm": 0.1622048169374466, "kl": 0.22472710385918618, "learning_rate": 1.4588134122511467e-05, "loss": 0.0493, "reward": 1.059375035762787, "reward_std": 0.11591703221201896, "rewards/accuracy_reward": 0.0750000024214387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1292 }, { "clip_ratio": 0.0, "completion_length": 576.279183959961, "epoch": 0.41382621219395105, "grad_norm": 0.0760549008846283, "kl": 0.20149843543767929, "learning_rate": 1.457820095817236e-05, "loss": 0.0466, "reward": 1.0385416865348815, "reward_std": 0.09624491930007935, "rewards/accuracy_reward": 0.0541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750059604645, "step": 1293 }, { "clip_ratio": 0.0, "completion_length": 551.4354278564454, "epoch": 0.41414626340214433, "grad_norm": 0.07412170618772507, "kl": 0.1810336247086525, "learning_rate": 1.4568262075455237e-05, "loss": 0.0198, "reward": 1.1057291984558106, "reward_std": 0.07921138815581799, "rewards/accuracy_reward": 0.11458333730697631, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9911458432674408, "step": 1294 }, { "clip_ratio": 0.0, "completion_length": 544.7146057128906, "epoch": 0.41446631461033767, "grad_norm": 0.11219371855258942, "kl": 0.3419033609330654, "learning_rate": 1.4558317486774216e-05, "loss": 0.0647, "reward": 1.0661458730697633, "reward_std": 0.09991935733705759, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9869791865348816, "step": 1295 }, { "clip_ratio": 0.0, "completion_length": 548.9521026611328, "epoch": 0.41478636581853096, "grad_norm": 0.10580622404813766, "kl": 0.1414179392158985, "learning_rate": 1.4548367204550526e-05, "loss": 0.0409, "reward": 1.0713541865348817, "reward_std": 0.09464184809476137, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9921875119209289, "step": 1296 }, { "clip_ratio": 0.0, "completion_length": 562.620849609375, "epoch": 0.4151064170267243, "grad_norm": 0.06379967927932739, "kl": 0.1508323907852173, "learning_rate": 1.4538411241212518e-05, "loss": 0.0404, "reward": 1.0572916924953462, "reward_std": 0.1208875872194767, "rewards/accuracy_reward": 0.06666666977107524, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990625011920929, "step": 1297 }, { "clip_ratio": 0.0, "completion_length": 565.933349609375, "epoch": 0.4154264682349176, "grad_norm": 0.18415191769599915, "kl": 0.5200513236224651, "learning_rate": 1.4528449609195639e-05, "loss": 0.0552, "reward": 1.0677083611488343, "reward_std": 0.12492301575839519, "rewards/accuracy_reward": 0.09166667014360427, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1298 }, { "clip_ratio": 0.0, "completion_length": 527.5270965576171, "epoch": 0.4157465194431109, "grad_norm": 0.09631240367889404, "kl": 0.23828165605664253, "learning_rate": 1.4518482320942409e-05, "loss": 0.0598, "reward": 1.021875023841858, "reward_std": 0.11106022223830223, "rewards/accuracy_reward": 0.03750000093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750059604645, "step": 1299 }, { "clip_ratio": 0.0, "completion_length": 622.9687622070312, "epoch": 0.4160665706513042, "grad_norm": 0.05873395502567291, "kl": 0.22104017101228238, "learning_rate": 1.4508509388902421e-05, "loss": 0.0281, "reward": 1.1083333551883698, "reward_std": 0.11215355768799781, "rewards/accuracy_reward": 0.12708333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9812500178813934, "step": 1300 }, { "clip_ratio": 0.0, "completion_length": 593.1458557128906, "epoch": 0.41638662185949754, "grad_norm": 0.07611161470413208, "kl": 0.42822214812040327, "learning_rate": 1.4498530825532309e-05, "loss": 0.0433, "reward": 1.0796875357627869, "reward_std": 0.11318734250962734, "rewards/accuracy_reward": 0.10000000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979687511920929, "step": 1301 }, { "clip_ratio": 0.0, "completion_length": 600.6083435058594, "epoch": 0.4167066730676908, "grad_norm": 0.08425391465425491, "kl": 0.2452217899262905, "learning_rate": 1.448854664329575e-05, "loss": 0.056, "reward": 1.0208333551883697, "reward_std": 0.12165859192609788, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333492279053, "step": 1302 }, { "clip_ratio": 0.0, "completion_length": 595.1333557128906, "epoch": 0.41702672427588416, "grad_norm": 0.07507262378931046, "kl": 0.16818516626954078, "learning_rate": 1.4478556854663435e-05, "loss": 0.0294, "reward": 1.0869791865348817, "reward_std": 0.1346562247723341, "rewards/accuracy_reward": 0.09791667070239782, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9890625059604645, "step": 1303 }, { "clip_ratio": 0.0, "completion_length": 604.5583557128906, "epoch": 0.41734677548407745, "grad_norm": 0.06492973864078522, "kl": 0.2809316359460354, "learning_rate": 1.4468561472113053e-05, "loss": 0.0517, "reward": 0.9942708492279053, "reward_std": 0.08789652790874243, "rewards/accuracy_reward": 0.01250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708432674408, "step": 1304 }, { "clip_ratio": 0.0, "completion_length": 575.5625152587891, "epoch": 0.4176668266922708, "grad_norm": 0.2417496144771576, "kl": 0.35190742164850236, "learning_rate": 1.4458560508129286e-05, "loss": 0.0777, "reward": 1.008854192495346, "reward_std": 0.1146691657602787, "rewards/accuracy_reward": 0.029166667349636556, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9796875178813934, "step": 1305 }, { "clip_ratio": 0.0, "completion_length": 572.8854339599609, "epoch": 0.41798687790046407, "grad_norm": 0.07067929953336716, "kl": 0.43219382539391515, "learning_rate": 1.444855397520379e-05, "loss": 0.0774, "reward": 1.0885416984558105, "reward_std": 0.19529549181461334, "rewards/accuracy_reward": 0.1166666692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750178813934, "step": 1306 }, { "clip_ratio": 0.0, "completion_length": 585.2937622070312, "epoch": 0.4183069291086574, "grad_norm": 0.17449365556240082, "kl": 0.4062021173536777, "learning_rate": 1.4438541885835167e-05, "loss": 0.0993, "reward": 1.0052083671092986, "reward_std": 0.1413590393960476, "rewards/accuracy_reward": 0.033333334140479565, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.971875011920929, "step": 1307 }, { "clip_ratio": 0.0, "completion_length": 566.5812622070313, "epoch": 0.4186269803168507, "grad_norm": 0.11451167613267899, "kl": 0.30510389655828474, "learning_rate": 1.4428524252528968e-05, "loss": 0.0636, "reward": 1.0005208551883698, "reward_std": 0.07861845903098583, "rewards/accuracy_reward": 0.016666667722165584, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541805744171, "step": 1308 }, { "clip_ratio": 0.0, "completion_length": 587.8896026611328, "epoch": 0.41894703152504403, "grad_norm": 0.12974369525909424, "kl": 0.3899438038468361, "learning_rate": 1.4418501087797667e-05, "loss": 0.0522, "reward": 1.029166692495346, "reward_std": 0.0809813478961587, "rewards/accuracy_reward": 0.04583333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333492279053, "step": 1309 }, { "clip_ratio": 0.0, "completion_length": 591.8771057128906, "epoch": 0.4192670827332373, "grad_norm": 0.370892196893692, "kl": 0.6947083935141564, "learning_rate": 1.440847240416064e-05, "loss": 0.0718, "reward": 1.0385416984558105, "reward_std": 0.14330552741885186, "rewards/accuracy_reward": 0.06041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250178813934, "step": 1310 }, { "clip_ratio": 0.0, "completion_length": 606.027099609375, "epoch": 0.4195871339414306, "grad_norm": 0.20165027678012848, "kl": 0.5955793671309948, "learning_rate": 1.4398438214144168e-05, "loss": 0.1082, "reward": 1.053645873069763, "reward_std": 0.1770230144262314, "rewards/accuracy_reward": 0.08125000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958551883698, "step": 1311 }, { "clip_ratio": 0.0, "completion_length": 561.0750183105469, "epoch": 0.41990718514962394, "grad_norm": 0.31423094868659973, "kl": 0.6826820693910122, "learning_rate": 1.4388398530281403e-05, "loss": 0.0712, "reward": 1.0364583551883697, "reward_std": 0.12629178818315268, "rewards/accuracy_reward": 0.05625000223517418, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9781250119209289, "step": 1312 }, { "clip_ratio": 0.0, "completion_length": 591.3646057128906, "epoch": 0.4202272363578172, "grad_norm": 0.30876481533050537, "kl": 0.5968435898423194, "learning_rate": 1.4378353365112353e-05, "loss": 0.1111, "reward": 1.0307291805744172, "reward_std": 0.1586002826690674, "rewards/accuracy_reward": 0.0645833345130086, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458432674408, "step": 1313 }, { "clip_ratio": 0.0, "completion_length": 577.1750244140625, "epoch": 0.42054728756601056, "grad_norm": 0.1654668152332306, "kl": 0.26816257983446123, "learning_rate": 1.436830273118389e-05, "loss": 0.0452, "reward": 1.0468750298023224, "reward_std": 0.11725292392075062, "rewards/accuracy_reward": 0.06666666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083551883698, "step": 1314 }, { "clip_ratio": 0.0, "completion_length": 577.8916839599609, "epoch": 0.42086733877420385, "grad_norm": 0.12751136720180511, "kl": 0.33656053617596626, "learning_rate": 1.4358246641049696e-05, "loss": 0.0896, "reward": 1.046354204416275, "reward_std": 0.14575700759887694, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375238418579, "step": 1315 }, { "clip_ratio": 0.0, "completion_length": 563.2708618164063, "epoch": 0.4211873899823972, "grad_norm": 0.20244768261909485, "kl": 0.6099851727485657, "learning_rate": 1.4348185107270282e-05, "loss": 0.0776, "reward": 1.0927083671092988, "reward_std": 0.16826162450015544, "rewards/accuracy_reward": 0.11875000316649675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 1316 }, { "clip_ratio": 0.0, "completion_length": 578.8937744140625, "epoch": 0.42150744119059047, "grad_norm": 0.13575513660907745, "kl": 0.4396487962454557, "learning_rate": 1.4338118142412956e-05, "loss": 0.0572, "reward": 1.1109375298023223, "reward_std": 0.15248046070337296, "rewards/accuracy_reward": 0.13750000409781932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973437511920929, "step": 1317 }, { "clip_ratio": 0.0, "completion_length": 617.4583557128906, "epoch": 0.4218274923987838, "grad_norm": 0.07689894735813141, "kl": 0.4052378758788109, "learning_rate": 1.4328045759051805e-05, "loss": 0.0527, "reward": 1.017187523841858, "reward_std": 0.1121408674865961, "rewards/accuracy_reward": 0.0458333345130086, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541865348816, "step": 1318 }, { "clip_ratio": 0.0, "completion_length": 582.8083557128906, "epoch": 0.4221475436069771, "grad_norm": 0.12368390709161758, "kl": 0.47785288393497466, "learning_rate": 1.4317967969767688e-05, "loss": 0.0957, "reward": 1.0130208492279054, "reward_std": 0.16585390493273736, "rewards/accuracy_reward": 0.054166669771075246, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9588541746139526, "step": 1319 }, { "clip_ratio": 0.0, "completion_length": 592.3416870117187, "epoch": 0.42246759481517043, "grad_norm": 0.20103225111961365, "kl": 0.5053670577704906, "learning_rate": 1.4307884787148216e-05, "loss": 0.0999, "reward": 1.0145833671092988, "reward_std": 0.2102899357676506, "rewards/accuracy_reward": 0.06458333618938923, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9500000238418579, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 570.6166809082031, "epoch": 0.4227876460233637, "grad_norm": 0.12856552004814148, "kl": 0.45006143152713773, "learning_rate": 1.4297796223787734e-05, "loss": 0.0796, "reward": 1.0359375298023223, "reward_std": 0.16981710996478797, "rewards/accuracy_reward": 0.07500000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375178813935, "step": 1321 }, { "clip_ratio": 0.0, "completion_length": 582.7375213623047, "epoch": 0.42310769723155706, "grad_norm": 0.26487281918525696, "kl": 0.5033496886491775, "learning_rate": 1.4287702292287308e-05, "loss": 0.115, "reward": 1.142187523841858, "reward_std": 0.18477849662303925, "rewards/accuracy_reward": 0.18333333563059567, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9588541805744171, "step": 1322 }, { "clip_ratio": 0.0, "completion_length": 579.2937713623047, "epoch": 0.42342774843975034, "grad_norm": 0.08229225873947144, "kl": 0.34848415181040765, "learning_rate": 1.4277603005254715e-05, "loss": 0.0462, "reward": 1.048437523841858, "reward_std": 0.11120131872594356, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708492279053, "step": 1323 }, { "clip_ratio": 0.0, "completion_length": 574.0687713623047, "epoch": 0.4237477996479437, "grad_norm": 0.2630603015422821, "kl": 0.40130707398056986, "learning_rate": 1.4267498375304417e-05, "loss": 0.0867, "reward": 1.148437535762787, "reward_std": 0.1714845359325409, "rewards/accuracy_reward": 0.16875000353902578, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979687511920929, "step": 1324 }, { "clip_ratio": 0.0, "completion_length": 614.0854309082031, "epoch": 0.42406785085613696, "grad_norm": 0.2801266610622406, "kl": 0.7189634054899215, "learning_rate": 1.425738841505754e-05, "loss": 0.1297, "reward": 1.0114583671092987, "reward_std": 0.16967907920479774, "rewards/accuracy_reward": 0.05208333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9593750238418579, "step": 1325 }, { "clip_ratio": 0.0, "completion_length": 583.1020965576172, "epoch": 0.4243879020643303, "grad_norm": 0.5371996164321899, "kl": 0.9050520665943622, "learning_rate": 1.4247273137141888e-05, "loss": 0.15, "reward": 1.0114583492279052, "reward_std": 0.16600136533379556, "rewards/accuracy_reward": 0.05833333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9531250119209289, "step": 1326 }, { "clip_ratio": 0.0, "completion_length": 582.4521026611328, "epoch": 0.4247079532725236, "grad_norm": 0.36296284198760986, "kl": 0.7386391490697861, "learning_rate": 1.4237152554191889e-05, "loss": 0.0947, "reward": 0.9786458432674408, "reward_std": 0.16325723454356195, "rewards/accuracy_reward": 0.02291666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291746139527, "step": 1327 }, { "clip_ratio": 0.0, "completion_length": 580.6958557128906, "epoch": 0.4250280044807169, "grad_norm": 0.19977205991744995, "kl": 0.4357809633016586, "learning_rate": 1.422702667884861e-05, "loss": 0.1095, "reward": 1.0614583492279053, "reward_std": 0.1346949001774192, "rewards/accuracy_reward": 0.08541666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1328 }, { "clip_ratio": 0.0, "completion_length": 585.3979431152344, "epoch": 0.4253480556889102, "grad_norm": 0.19443321228027344, "kl": 0.5088992670178414, "learning_rate": 1.421689552375972e-05, "loss": 0.114, "reward": 1.0536458551883698, "reward_std": 0.16536558866500856, "rewards/accuracy_reward": 0.09166666995733977, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791865348816, "step": 1329 }, { "clip_ratio": 0.0, "completion_length": 556.1500244140625, "epoch": 0.42566810689710355, "grad_norm": 0.10612791031599045, "kl": 0.22260906249284745, "learning_rate": 1.4206759101579481e-05, "loss": 0.0702, "reward": 1.0682291746139527, "reward_std": 0.12378347031772137, "rewards/accuracy_reward": 0.08750000335276127, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9786458492279053, "step": 1330 }, { "clip_ratio": 0.0, "completion_length": 569.0666778564453, "epoch": 0.42598815810529683, "grad_norm": 0.30102500319480896, "kl": 0.4394591063261032, "learning_rate": 1.4196617424968744e-05, "loss": 0.0728, "reward": 1.051562535762787, "reward_std": 0.13498535864055156, "rewards/accuracy_reward": 0.07500000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625119209289, "step": 1331 }, { "clip_ratio": 0.0, "completion_length": 556.0229370117188, "epoch": 0.42630820931349017, "grad_norm": 0.17076443135738373, "kl": 0.2940096914768219, "learning_rate": 1.4186470506594919e-05, "loss": 0.0614, "reward": 1.027604204416275, "reward_std": 0.12638225294649602, "rewards/accuracy_reward": 0.05208333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208492279053, "step": 1332 }, { "clip_ratio": 0.0, "completion_length": 572.5958435058594, "epoch": 0.42662826052168346, "grad_norm": 0.2015293687582016, "kl": 0.1857258200645447, "learning_rate": 1.4176318359131955e-05, "loss": 0.0245, "reward": 1.0583333492279052, "reward_std": 0.09938515722751617, "rewards/accuracy_reward": 0.07291666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166746139527, "step": 1333 }, { "clip_ratio": 0.0, "completion_length": 553.6291870117187, "epoch": 0.4269483117298768, "grad_norm": 0.14335834980010986, "kl": 0.2323061317205429, "learning_rate": 1.4166160995260342e-05, "loss": 0.0585, "reward": 1.136979192495346, "reward_std": 0.13655745461583138, "rewards/accuracy_reward": 0.16041666697710752, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625059604645, "step": 1334 }, { "clip_ratio": 0.0, "completion_length": 567.8916839599609, "epoch": 0.4272683629380701, "grad_norm": 0.12033113837242126, "kl": 0.2362310327589512, "learning_rate": 1.4155998427667083e-05, "loss": 0.0372, "reward": 1.0317708492279052, "reward_std": 0.11146253608167171, "rewards/accuracy_reward": 0.04791666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541805744171, "step": 1335 }, { "clip_ratio": 0.0, "completion_length": 594.7312744140625, "epoch": 0.4275884141462634, "grad_norm": 0.1028214618563652, "kl": 0.3947672449052334, "learning_rate": 1.414583066904568e-05, "loss": 0.0747, "reward": 1.0239583611488343, "reward_std": 0.08998498003929853, "rewards/accuracy_reward": 0.04375000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083551883698, "step": 1336 }, { "clip_ratio": 0.0, "completion_length": 584.7854431152343, "epoch": 0.4279084653544567, "grad_norm": 0.11494532227516174, "kl": 0.3638791225850582, "learning_rate": 1.4135657732096118e-05, "loss": 0.1045, "reward": 1.0416666984558105, "reward_std": 0.12086176574230194, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500119209289, "step": 1337 }, { "clip_ratio": 0.0, "completion_length": 555.3312683105469, "epoch": 0.42822851656265004, "grad_norm": 0.09641270339488983, "kl": 0.22046290934085847, "learning_rate": 1.4125479629524849e-05, "loss": 0.0818, "reward": 1.0807291984558105, "reward_std": 0.1261975012719631, "rewards/accuracy_reward": 0.10625000316649676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791865348816, "step": 1338 }, { "clip_ratio": 0.0, "completion_length": 569.4583526611328, "epoch": 0.4285485677708433, "grad_norm": 0.07313210517168045, "kl": 0.35226295664906504, "learning_rate": 1.411529637404478e-05, "loss": 0.0564, "reward": 1.121354204416275, "reward_std": 0.14535944275557994, "rewards/accuracy_reward": 0.14791666958481073, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375178813934, "step": 1339 }, { "clip_ratio": 0.0, "completion_length": 556.185433959961, "epoch": 0.42886861897903666, "grad_norm": 0.09117227047681808, "kl": 0.3916263036429882, "learning_rate": 1.4105107978375256e-05, "loss": 0.093, "reward": 0.986979192495346, "reward_std": 0.10924595408141613, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625178813935, "step": 1340 }, { "clip_ratio": 0.0, "completion_length": 548.2000244140625, "epoch": 0.42918867018722995, "grad_norm": 0.17177411913871765, "kl": 0.3724663570523262, "learning_rate": 1.409491445524204e-05, "loss": 0.0719, "reward": 1.139062523841858, "reward_std": 0.15040809139609337, "rewards/accuracy_reward": 0.15833333749324083, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291746139526, "step": 1341 }, { "clip_ratio": 0.0, "completion_length": 554.383349609375, "epoch": 0.4295087213954233, "grad_norm": 0.248824343085289, "kl": 0.46576319485902784, "learning_rate": 1.4084715817377292e-05, "loss": 0.0807, "reward": 1.0031250238418579, "reward_std": 0.1396949838846922, "rewards/accuracy_reward": 0.02916666753590107, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 1342 }, { "clip_ratio": 0.0, "completion_length": 546.879183959961, "epoch": 0.42982877260361657, "grad_norm": 0.15470728278160095, "kl": 0.295084173977375, "learning_rate": 1.4074512077519571e-05, "loss": 0.0712, "reward": 1.148437535762787, "reward_std": 0.11446435116231442, "rewards/accuracy_reward": 0.170833339355886, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 1343 }, { "clip_ratio": 0.0, "completion_length": 544.7166778564454, "epoch": 0.4301488238118099, "grad_norm": 0.11134085059165955, "kl": 0.4050963319838047, "learning_rate": 1.4064303248413808e-05, "loss": 0.0674, "reward": 1.0354166865348815, "reward_std": 0.1173494003713131, "rewards/accuracy_reward": 0.05208333618938923, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333373069764, "step": 1344 }, { "clip_ratio": 0.0, "completion_length": 556.3625183105469, "epoch": 0.4304688750200032, "grad_norm": 0.1598719358444214, "kl": 0.3194881580770016, "learning_rate": 1.4054089342811286e-05, "loss": 0.0722, "reward": 1.0427083492279052, "reward_std": 0.11805956549942494, "rewards/accuracy_reward": 0.0645833346992731, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250178813934, "step": 1345 }, { "clip_ratio": 0.0, "completion_length": 557.6125091552734, "epoch": 0.43078892622819653, "grad_norm": 0.06611867994070053, "kl": 0.24079276397824287, "learning_rate": 1.4043870373469628e-05, "loss": 0.0698, "reward": 1.0916666865348816, "reward_std": 0.12190479338169098, "rewards/accuracy_reward": 0.1083333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333492279053, "step": 1346 }, { "clip_ratio": 0.0, "completion_length": 547.9312622070313, "epoch": 0.4311089774363898, "grad_norm": 0.16819950938224792, "kl": 0.30558022633194926, "learning_rate": 1.4033646353152786e-05, "loss": 0.05, "reward": 1.0427083551883698, "reward_std": 0.14329716470092535, "rewards/accuracy_reward": 0.0604166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916746139526, "step": 1347 }, { "clip_ratio": 0.0, "completion_length": 544.6791839599609, "epoch": 0.43142902864458316, "grad_norm": 0.06702463328838348, "kl": 0.24792197570204735, "learning_rate": 1.4023417294631019e-05, "loss": 0.0523, "reward": 1.0468750298023224, "reward_std": 0.0981780519708991, "rewards/accuracy_reward": 0.06250000316649676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1348 }, { "clip_ratio": 0.0, "completion_length": 552.0000152587891, "epoch": 0.43174907985277644, "grad_norm": 0.1554039567708969, "kl": 0.26410412788391113, "learning_rate": 1.401318321068088e-05, "loss": 0.0761, "reward": 1.0552083432674408, "reward_std": 0.13532831519842148, "rewards/accuracy_reward": 0.0791666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416746139526, "step": 1349 }, { "clip_ratio": 0.0, "completion_length": 506.9354309082031, "epoch": 0.4320691310609698, "grad_norm": 0.07704272121191025, "kl": 0.310164712369442, "learning_rate": 1.40029441140852e-05, "loss": 0.034, "reward": 1.0609375298023225, "reward_std": 0.13052574992179872, "rewards/accuracy_reward": 0.08125000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979687511920929, "step": 1350 }, { "clip_ratio": 0.0, "completion_length": 583.5687683105468, "epoch": 0.43238918226916306, "grad_norm": 0.07124790549278259, "kl": 0.33384485468268393, "learning_rate": 1.3992700017633063e-05, "loss": 0.0371, "reward": 1.0890625357627868, "reward_std": 0.15975245274603367, "rewards/accuracy_reward": 0.10833333600312471, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291865348816, "step": 1351 }, { "clip_ratio": 0.0, "completion_length": 547.70419921875, "epoch": 0.4327092334773564, "grad_norm": 0.05800170451402664, "kl": 0.24518816471099852, "learning_rate": 1.3982450934119808e-05, "loss": 0.0494, "reward": 1.035937535762787, "reward_std": 0.12522053439170122, "rewards/accuracy_reward": 0.052083334513008596, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9817708492279053, "step": 1352 }, { "clip_ratio": 0.0, "completion_length": 545.5833526611328, "epoch": 0.4330292846855497, "grad_norm": 0.23074017465114594, "kl": 0.4324066393077374, "learning_rate": 1.3972196876347005e-05, "loss": 0.0881, "reward": 1.1557292103767396, "reward_std": 0.10613631978631019, "rewards/accuracy_reward": 0.17291667219251394, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982812511920929, "step": 1353 }, { "clip_ratio": 0.0, "completion_length": 580.9500213623047, "epoch": 0.433349335893743, "grad_norm": 0.20919708907604218, "kl": 0.3582280553877354, "learning_rate": 1.3961937857122418e-05, "loss": 0.0597, "reward": 1.097916692495346, "reward_std": 0.10435024127364159, "rewards/accuracy_reward": 0.12291667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000059604644, "step": 1354 }, { "clip_ratio": 0.0, "completion_length": 528.3750213623047, "epoch": 0.4336693871019363, "grad_norm": 0.16085828840732574, "kl": 0.47188392728567125, "learning_rate": 1.3951673889260033e-05, "loss": 0.0707, "reward": 1.1635417103767396, "reward_std": 0.1453533548861742, "rewards/accuracy_reward": 0.18333334047347308, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083492279052, "step": 1355 }, { "clip_ratio": 0.0, "completion_length": 573.4375183105469, "epoch": 0.4339894383101296, "grad_norm": 0.09656932204961777, "kl": 0.3448480650782585, "learning_rate": 1.394140498558e-05, "loss": 0.0775, "reward": 0.9875000178813934, "reward_std": 0.12802811972796918, "rewards/accuracy_reward": 0.01666666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333551883698, "step": 1356 }, { "clip_ratio": 0.0, "completion_length": 563.3520965576172, "epoch": 0.43430948951832293, "grad_norm": 0.12296207249164581, "kl": 0.41478071361780167, "learning_rate": 1.3931131158908644e-05, "loss": 0.0401, "reward": 1.014583373069763, "reward_std": 0.12463017739355564, "rewards/accuracy_reward": 0.0375000013038516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833611488342, "step": 1357 }, { "clip_ratio": 0.0, "completion_length": 557.7896118164062, "epoch": 0.4346295407265162, "grad_norm": 0.11532151699066162, "kl": 0.5874006308615207, "learning_rate": 1.392085242207843e-05, "loss": 0.1042, "reward": 0.9869791865348816, "reward_std": 0.15448905751109124, "rewards/accuracy_reward": 0.025000000931322576, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791805744171, "step": 1358 }, { "clip_ratio": 0.0, "completion_length": 548.7541900634766, "epoch": 0.43494959193470956, "grad_norm": 0.16093531250953674, "kl": 0.7109584361314774, "learning_rate": 1.391056878792796e-05, "loss": 0.1082, "reward": 1.0348958671092987, "reward_std": 0.18050943203270436, "rewards/accuracy_reward": 0.07291666958481073, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791865348816, "step": 1359 }, { "clip_ratio": 0.0, "completion_length": 579.0562744140625, "epoch": 0.43526964314290284, "grad_norm": 0.13128206133842468, "kl": 0.5672376766800881, "learning_rate": 1.3900280269301957e-05, "loss": 0.0712, "reward": 1.043750035762787, "reward_std": 0.17327113449573517, "rewards/accuracy_reward": 0.07500000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500178813935, "step": 1360 }, { "clip_ratio": 0.0, "completion_length": 554.2541809082031, "epoch": 0.4355896943510962, "grad_norm": 0.09163526445627213, "kl": 0.29370440244674684, "learning_rate": 1.3889986879051242e-05, "loss": 0.0631, "reward": 1.1031250357627869, "reward_std": 0.13333576880395412, "rewards/accuracy_reward": 0.1250000050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250178813934, "step": 1361 }, { "clip_ratio": 0.0, "completion_length": 580.808349609375, "epoch": 0.43590974555928946, "grad_norm": 0.14829999208450317, "kl": 0.40503218322992324, "learning_rate": 1.3879688630032717e-05, "loss": 0.0806, "reward": 1.079166704416275, "reward_std": 0.1650255832821131, "rewards/accuracy_reward": 0.10833333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333551883698, "step": 1362 }, { "clip_ratio": 0.0, "completion_length": 580.5041900634766, "epoch": 0.4362297967674828, "grad_norm": 0.23036116361618042, "kl": 0.4940149299800396, "learning_rate": 1.3869385535109358e-05, "loss": 0.0977, "reward": 1.0208333492279054, "reward_std": 0.18670744299888611, "rewards/accuracy_reward": 0.05208333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500119209289, "step": 1363 }, { "clip_ratio": 0.0, "completion_length": 578.0000244140625, "epoch": 0.4365498479756761, "grad_norm": 0.07721938192844391, "kl": 0.32006452083587644, "learning_rate": 1.385907760715019e-05, "loss": 0.0435, "reward": 1.0260416865348816, "reward_std": 0.10851494073867798, "rewards/accuracy_reward": 0.05208333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 1364 }, { "clip_ratio": 0.0, "completion_length": 587.4312744140625, "epoch": 0.4368698991838694, "grad_norm": 0.07113399356603622, "kl": 0.3193306714296341, "learning_rate": 1.3848764859030281e-05, "loss": 0.0463, "reward": 1.076562523841858, "reward_std": 0.14118604324758052, "rewards/accuracy_reward": 0.09166666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958432674408, "step": 1365 }, { "clip_ratio": 0.0, "completion_length": 574.0958587646485, "epoch": 0.4371899503920627, "grad_norm": 0.0633530393242836, "kl": 0.27230303883552553, "learning_rate": 1.3838447303630713e-05, "loss": 0.0554, "reward": 1.0484375298023223, "reward_std": 0.12122409045696259, "rewards/accuracy_reward": 0.06666666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708432674408, "step": 1366 }, { "clip_ratio": 0.0, "completion_length": 560.2021026611328, "epoch": 0.43751000160025605, "grad_norm": 0.0803065150976181, "kl": 0.3396205462515354, "learning_rate": 1.3828124953838574e-05, "loss": 0.0545, "reward": 1.0364583671092986, "reward_std": 0.12746839523315429, "rewards/accuracy_reward": 0.05833333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250178813934, "step": 1367 }, { "clip_ratio": 0.0, "completion_length": 574.239599609375, "epoch": 0.43783005280844933, "grad_norm": 0.10097281634807587, "kl": 0.3694083333015442, "learning_rate": 1.381779782254694e-05, "loss": 0.0711, "reward": 1.115625023841858, "reward_std": 0.17042535543441772, "rewards/accuracy_reward": 0.1375000026077032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250119209289, "step": 1368 }, { "clip_ratio": 0.0, "completion_length": 544.095849609375, "epoch": 0.43815010401664267, "grad_norm": 0.1160140186548233, "kl": 0.41535960510373116, "learning_rate": 1.3807465922654863e-05, "loss": 0.0712, "reward": 0.9838541805744171, "reward_std": 0.10553747303783893, "rewards/accuracy_reward": 0.012500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541865348816, "step": 1369 }, { "clip_ratio": 0.0, "completion_length": 561.1791839599609, "epoch": 0.43847015522483596, "grad_norm": 0.1678745597600937, "kl": 0.25593645721673963, "learning_rate": 1.3797129267067348e-05, "loss": 0.0589, "reward": 1.0921875357627868, "reward_std": 0.14206044673919677, "rewards/accuracy_reward": 0.11041667088866233, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708373069763, "step": 1370 }, { "clip_ratio": 0.0, "completion_length": 580.6916809082031, "epoch": 0.4387902064330293, "grad_norm": 0.0591396726667881, "kl": 0.24178946688771247, "learning_rate": 1.378678786869534e-05, "loss": 0.0485, "reward": 1.0317708492279052, "reward_std": 0.1332608327269554, "rewards/accuracy_reward": 0.054166667722165586, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041746139527, "step": 1371 }, { "clip_ratio": 0.0, "completion_length": 562.7541900634766, "epoch": 0.4391102576412226, "grad_norm": 0.11310972273349762, "kl": 0.44280795007944107, "learning_rate": 1.3776441740455706e-05, "loss": 0.0808, "reward": 1.0494792103767394, "reward_std": 0.162718590721488, "rewards/accuracy_reward": 0.07916666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125119209289, "step": 1372 }, { "clip_ratio": 0.0, "completion_length": 576.5375183105468, "epoch": 0.4394303088494159, "grad_norm": 0.7537197470664978, "kl": 0.926392175257206, "learning_rate": 1.376609089527123e-05, "loss": 0.1126, "reward": 1.079166692495346, "reward_std": 0.13849810790270567, "rewards/accuracy_reward": 0.11875000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9604166865348815, "step": 1373 }, { "clip_ratio": 0.0, "completion_length": 534.4771026611328, "epoch": 0.4397503600576092, "grad_norm": 0.17892588675022125, "kl": 0.3169956490397453, "learning_rate": 1.3755735346070576e-05, "loss": 0.0841, "reward": 1.1223958790302277, "reward_std": 0.18202281817793847, "rewards/accuracy_reward": 0.14791667200624942, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791865348816, "step": 1374 }, { "clip_ratio": 0.0, "completion_length": 594.9979370117187, "epoch": 0.44007041126580254, "grad_norm": 0.1239943578839302, "kl": 0.5117902666330337, "learning_rate": 1.374537510578829e-05, "loss": 0.0844, "reward": 1.0890625357627868, "reward_std": 0.13241406325250865, "rewards/accuracy_reward": 0.1166666703298688, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958492279052, "step": 1375 }, { "clip_ratio": 0.0, "completion_length": 557.6083526611328, "epoch": 0.4403904624739958, "grad_norm": 0.12511348724365234, "kl": 0.4907986491918564, "learning_rate": 1.3735010187364776e-05, "loss": 0.0473, "reward": 0.9937500119209289, "reward_std": 0.13010377399623393, "rewards/accuracy_reward": 0.01875000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000178813935, "step": 1376 }, { "clip_ratio": 0.0, "completion_length": 586.4604370117188, "epoch": 0.44071051368218916, "grad_norm": 0.11388861387968063, "kl": 0.2962417095899582, "learning_rate": 1.3724640603746282e-05, "loss": 0.0731, "reward": 1.1531250298023223, "reward_std": 0.1589014722034335, "rewards/accuracy_reward": 0.1812500048428774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750178813934, "step": 1377 }, { "clip_ratio": 0.0, "completion_length": 572.4437683105468, "epoch": 0.44103056489038245, "grad_norm": 0.07961485534906387, "kl": 0.2853081613779068, "learning_rate": 1.3714266367884883e-05, "loss": 0.0625, "reward": 1.008854204416275, "reward_std": 0.1624489687383175, "rewards/accuracy_reward": 0.0354166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375178813934, "step": 1378 }, { "clip_ratio": 0.0, "completion_length": 573.8771057128906, "epoch": 0.4413506160985758, "grad_norm": 0.2938500940799713, "kl": 0.4174025818705559, "learning_rate": 1.3703887492738463e-05, "loss": 0.1021, "reward": 1.0604166865348816, "reward_std": 0.12792656924575568, "rewards/accuracy_reward": 0.09166666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500178813935, "step": 1379 }, { "clip_ratio": 0.0, "completion_length": 561.7812713623047, "epoch": 0.44167066730676907, "grad_norm": 0.19335335493087769, "kl": 0.3752726331353188, "learning_rate": 1.36935039912707e-05, "loss": 0.0885, "reward": 1.1031250298023223, "reward_std": 0.16487400010228156, "rewards/accuracy_reward": 0.1375000050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965625011920929, "step": 1380 }, { "clip_ratio": 0.0, "completion_length": 560.3437622070312, "epoch": 0.4419907185149624, "grad_norm": 0.1562214195728302, "kl": 0.46785186752676966, "learning_rate": 1.3683115876451054e-05, "loss": 0.0611, "reward": 1.1208333492279052, "reward_std": 0.16405072771012782, "rewards/accuracy_reward": 0.15000000428408383, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 1381 }, { "clip_ratio": 0.0, "completion_length": 566.9271057128906, "epoch": 0.4423107697231557, "grad_norm": 0.1530592143535614, "kl": 0.3877778798341751, "learning_rate": 1.3672723161254748e-05, "loss": 0.0817, "reward": 1.063541692495346, "reward_std": 0.18836085498332977, "rewards/accuracy_reward": 0.10000000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9635416865348816, "step": 1382 }, { "clip_ratio": 0.0, "completion_length": 592.8270874023438, "epoch": 0.44263082093134903, "grad_norm": 0.23629511892795563, "kl": 0.3606575734913349, "learning_rate": 1.3662325858662743e-05, "loss": 0.0675, "reward": 1.1322916924953461, "reward_std": 0.10985236279666424, "rewards/accuracy_reward": 0.1541666707023978, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250178813934, "step": 1383 }, { "clip_ratio": 0.0, "completion_length": 586.9500274658203, "epoch": 0.4429508721395423, "grad_norm": 0.1339237093925476, "kl": 0.508060896396637, "learning_rate": 1.3651923981661741e-05, "loss": 0.1255, "reward": 0.9791666984558105, "reward_std": 0.16428967230021954, "rewards/accuracy_reward": 0.01458333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833492279052, "step": 1384 }, { "clip_ratio": 0.0, "completion_length": 610.7604370117188, "epoch": 0.44327092334773566, "grad_norm": 0.07876008003950119, "kl": 0.30484682992100715, "learning_rate": 1.3641517543244152e-05, "loss": 0.0557, "reward": 1.0671875178813934, "reward_std": 0.13168583028018474, "rewards/accuracy_reward": 0.08541666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708611488343, "step": 1385 }, { "clip_ratio": 0.0, "completion_length": 561.7666870117188, "epoch": 0.44359097455592894, "grad_norm": 0.22591634094715118, "kl": 0.41740057840943334, "learning_rate": 1.363110655640808e-05, "loss": 0.0997, "reward": 1.0437500298023223, "reward_std": 0.13870418183505534, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000178813935, "step": 1386 }, { "clip_ratio": 0.0, "completion_length": 584.49794921875, "epoch": 0.4439110257641223, "grad_norm": 0.34502559900283813, "kl": 0.41770399175584316, "learning_rate": 1.3620691034157314e-05, "loss": 0.0936, "reward": 1.0588541865348815, "reward_std": 0.11579778082668782, "rewards/accuracy_reward": 0.0937500026077032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041805744172, "step": 1387 }, { "clip_ratio": 0.0, "completion_length": 594.4083557128906, "epoch": 0.44423107697231556, "grad_norm": 0.08325286954641342, "kl": 0.22816858440637589, "learning_rate": 1.3610270989501311e-05, "loss": 0.06, "reward": 1.1005208671092988, "reward_std": 0.16243541650474072, "rewards/accuracy_reward": 0.12500000316649676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208551883697, "step": 1388 }, { "clip_ratio": 0.0, "completion_length": 579.3875244140625, "epoch": 0.4445511281805089, "grad_norm": 0.10017146915197372, "kl": 0.32076493874192236, "learning_rate": 1.3599846435455168e-05, "loss": 0.1516, "reward": 1.0677083671092986, "reward_std": 0.17430904135107994, "rewards/accuracy_reward": 0.10833333842456341, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9593750178813935, "step": 1389 }, { "clip_ratio": 0.0, "completion_length": 586.8875183105469, "epoch": 0.4448711793887022, "grad_norm": 0.05737130716443062, "kl": 0.19035155102610588, "learning_rate": 1.358941738503963e-05, "loss": 0.0669, "reward": 1.051562523841858, "reward_std": 0.12057933807373047, "rewards/accuracy_reward": 0.07500000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625178813935, "step": 1390 }, { "clip_ratio": 0.0, "completion_length": 589.5666870117187, "epoch": 0.4451912305968955, "grad_norm": 0.20350056886672974, "kl": 0.42712721824645994, "learning_rate": 1.3578983851281036e-05, "loss": 0.0805, "reward": 1.098437523841858, "reward_std": 0.1661032922565937, "rewards/accuracy_reward": 0.12500000316649676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375059604645, "step": 1391 }, { "clip_ratio": 0.0, "completion_length": 577.7083557128906, "epoch": 0.4455112818050888, "grad_norm": 0.1656087338924408, "kl": 0.30918216332793236, "learning_rate": 1.3568545847211345e-05, "loss": 0.0968, "reward": 1.0078125178813935, "reward_std": 0.12089228257536888, "rewards/accuracy_reward": 0.03541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958551883698, "step": 1392 }, { "clip_ratio": 0.0, "completion_length": 583.4666809082031, "epoch": 0.44583133301328215, "grad_norm": 0.046774476766586304, "kl": 0.19798714965581893, "learning_rate": 1.3558103385868087e-05, "loss": 0.0465, "reward": 1.0338541924953462, "reward_std": 0.0858245899900794, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708492279053, "step": 1393 }, { "clip_ratio": 0.0, "completion_length": 562.0250183105469, "epoch": 0.44615138422147543, "grad_norm": 0.06257335096597672, "kl": 0.15758491531014443, "learning_rate": 1.3547656480294365e-05, "loss": 0.0458, "reward": 1.069791704416275, "reward_std": 0.13865265790373088, "rewards/accuracy_reward": 0.08750000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916924953461, "step": 1394 }, { "clip_ratio": 0.0, "completion_length": 594.7229370117187, "epoch": 0.4464714354296688, "grad_norm": 0.21203400194644928, "kl": 0.16767778843641282, "learning_rate": 1.3537205143538837e-05, "loss": 0.0709, "reward": 1.051562523841858, "reward_std": 0.11704848129302263, "rewards/accuracy_reward": 0.07291666995733977, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1395 }, { "clip_ratio": 0.0, "completion_length": 585.4479339599609, "epoch": 0.44679148663786206, "grad_norm": 0.09236126393079758, "kl": 0.2601070187985897, "learning_rate": 1.352674938865568e-05, "loss": 0.0594, "reward": 1.0505208551883698, "reward_std": 0.0968201220035553, "rewards/accuracy_reward": 0.07500000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 1396 }, { "clip_ratio": 0.0, "completion_length": 550.2979400634765, "epoch": 0.4471115378460554, "grad_norm": 0.0691533014178276, "kl": 0.251144764572382, "learning_rate": 1.351628922870461e-05, "loss": 0.0553, "reward": 1.0411458671092988, "reward_std": 0.12758275177329778, "rewards/accuracy_reward": 0.05625000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958492279053, "step": 1397 }, { "clip_ratio": 0.0, "completion_length": 630.1812683105469, "epoch": 0.4474315890542487, "grad_norm": 0.07024051249027252, "kl": 0.21441357135772704, "learning_rate": 1.350582467675083e-05, "loss": 0.053, "reward": 1.0067708492279053, "reward_std": 0.09378877226263285, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708432674408, "step": 1398 }, { "clip_ratio": 0.0, "completion_length": 589.5125183105469, "epoch": 0.44775164026244196, "grad_norm": 0.08377721160650253, "kl": 0.2407459184527397, "learning_rate": 1.3495355745865038e-05, "loss": 0.0422, "reward": 1.0979166984558106, "reward_std": 0.16228131018579006, "rewards/accuracy_reward": 0.1166666705161333, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981250011920929, "step": 1399 }, { "clip_ratio": 0.0, "completion_length": 538.1604248046875, "epoch": 0.4480716914706353, "grad_norm": 0.13796372711658478, "kl": 0.13778001070022583, "learning_rate": 1.348488244912339e-05, "loss": 0.0283, "reward": 1.0557291865348817, "reward_std": 0.13359115049242973, "rewards/accuracy_reward": 0.06875000279396773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9869791805744171, "step": 1400 }, { "clip_ratio": 0.0, "completion_length": 596.5208557128906, "epoch": 0.4483917426788286, "grad_norm": 0.06486453860998154, "kl": 0.22841630578041078, "learning_rate": 1.347440479960751e-05, "loss": 0.0407, "reward": 1.1463542044162751, "reward_std": 0.09497451074421406, "rewards/accuracy_reward": 0.16041667331010104, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 1401 }, { "clip_ratio": 0.0, "completion_length": 577.2041931152344, "epoch": 0.4487117938870219, "grad_norm": 0.07434765994548798, "kl": 0.3341860793530941, "learning_rate": 1.3463922810404448e-05, "loss": 0.0444, "reward": 0.9833333551883697, "reward_std": 0.08629503026604653, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833551883698, "step": 1402 }, { "clip_ratio": 0.0, "completion_length": 598.9000061035156, "epoch": 0.4490318450952152, "grad_norm": 0.11220329999923706, "kl": 0.17129188179969787, "learning_rate": 1.3453436494606683e-05, "loss": 0.0858, "reward": 1.0385416865348815, "reward_std": 0.17846698872745037, "rewards/accuracy_reward": 0.06666666809469461, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750178813934, "step": 1403 }, { "clip_ratio": 0.0, "completion_length": 589.8812683105468, "epoch": 0.44935189630340855, "grad_norm": 0.11341900378465652, "kl": 0.2534954246133566, "learning_rate": 1.3442945865312085e-05, "loss": 0.0456, "reward": 1.0838541984558105, "reward_std": 0.15260363109409808, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708492279053, "step": 1404 }, { "clip_ratio": 0.0, "completion_length": 598.1354400634766, "epoch": 0.44967194751160183, "grad_norm": 0.07155326008796692, "kl": 0.21175614856183528, "learning_rate": 1.3432450935623922e-05, "loss": 0.0746, "reward": 1.0703125298023224, "reward_std": 0.11777629610151052, "rewards/accuracy_reward": 0.09583333600312471, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791805744171, "step": 1405 }, { "clip_ratio": 0.0, "completion_length": 613.7104370117188, "epoch": 0.44999199871979517, "grad_norm": 0.06758978217840195, "kl": 0.21431030780076982, "learning_rate": 1.3421951718650836e-05, "loss": 0.0479, "reward": 1.0343750178813935, "reward_std": 0.09610320255160332, "rewards/accuracy_reward": 0.05000000242143869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1406 }, { "clip_ratio": 0.0, "completion_length": 592.6521087646485, "epoch": 0.45031204992798846, "grad_norm": 0.06923665851354599, "kl": 0.17125880494713783, "learning_rate": 1.3411448227506815e-05, "loss": 0.0414, "reward": 1.0781250298023224, "reward_std": 0.08269294798374176, "rewards/accuracy_reward": 0.09166666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583492279053, "step": 1407 }, { "clip_ratio": 0.0, "completion_length": 590.1666839599609, "epoch": 0.4506321011361818, "grad_norm": 0.046295374631881714, "kl": 0.16617081128060818, "learning_rate": 1.3400940475311193e-05, "loss": 0.0337, "reward": 1.1041666984558105, "reward_std": 0.15676663368940352, "rewards/accuracy_reward": 0.12083333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333492279053, "step": 1408 }, { "clip_ratio": 0.0, "completion_length": 581.8687744140625, "epoch": 0.4509521523443751, "grad_norm": 0.12262304872274399, "kl": 0.3631344482302666, "learning_rate": 1.3390428475188617e-05, "loss": 0.022, "reward": 1.117187535762787, "reward_std": 0.11911057773977518, "rewards/accuracy_reward": 0.1333333384245634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541805744171, "step": 1409 }, { "clip_ratio": 0.0, "completion_length": 583.472933959961, "epoch": 0.4512722035525684, "grad_norm": 0.06850114464759827, "kl": 0.25028989017009734, "learning_rate": 1.337991224026905e-05, "loss": 0.067, "reward": 1.0416666924953462, "reward_std": 0.09098817594349384, "rewards/accuracy_reward": 0.06458333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833551883698, "step": 1410 }, { "clip_ratio": 0.0, "completion_length": 585.7666870117188, "epoch": 0.4515922547607617, "grad_norm": 0.10766538232564926, "kl": 0.2578369677066803, "learning_rate": 1.3369391783687742e-05, "loss": 0.0676, "reward": 1.090625035762787, "reward_std": 0.14667030721902846, "rewards/accuracy_reward": 0.1062500026077032, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9822916865348816, "step": 1411 }, { "clip_ratio": 0.0, "completion_length": 564.9416931152343, "epoch": 0.45191230596895504, "grad_norm": 0.07885698229074478, "kl": 0.33700631856918334, "learning_rate": 1.3358867118585212e-05, "loss": 0.0704, "reward": 1.0671875417232513, "reward_std": 0.09663599599152803, "rewards/accuracy_reward": 0.08750000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9796875178813934, "step": 1412 }, { "clip_ratio": 0.0, "completion_length": 548.1187683105469, "epoch": 0.4522323571771483, "grad_norm": 0.10163812339305878, "kl": 0.2576158232986927, "learning_rate": 1.3348338258107235e-05, "loss": 0.0956, "reward": 1.1973958730697631, "reward_std": 0.1255306104198098, "rewards/accuracy_reward": 0.21875000596046448, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1413 }, { "clip_ratio": 0.0, "completion_length": 611.9375122070312, "epoch": 0.45255240838534166, "grad_norm": 0.09681346267461777, "kl": 0.4438394993543625, "learning_rate": 1.3337805215404837e-05, "loss": 0.101, "reward": 1.0468750417232513, "reward_std": 0.1585536990314722, "rewards/accuracy_reward": 0.08333333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.963541692495346, "step": 1414 }, { "clip_ratio": 0.0, "completion_length": 568.3833465576172, "epoch": 0.45287245959353495, "grad_norm": 0.09335222840309143, "kl": 0.3850545734167099, "learning_rate": 1.3327268003634255e-05, "loss": 0.1005, "reward": 1.0765625357627868, "reward_std": 0.17124846372753383, "rewards/accuracy_reward": 0.0958333371207118, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291805744172, "step": 1415 }, { "clip_ratio": 0.0, "completion_length": 599.4562805175781, "epoch": 0.4531925108017283, "grad_norm": 0.14342984557151794, "kl": 0.3690617233514786, "learning_rate": 1.3316726635956938e-05, "loss": 0.0825, "reward": 1.0729166984558105, "reward_std": 0.09912073966115713, "rewards/accuracy_reward": 0.09583333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833551883698, "step": 1416 }, { "clip_ratio": 0.0, "completion_length": 605.1208557128906, "epoch": 0.45351256200992157, "grad_norm": 0.09222087264060974, "kl": 0.13021155372262, "learning_rate": 1.3306181125539528e-05, "loss": 0.0387, "reward": 1.0973958551883698, "reward_std": 0.06767892204225064, "rewards/accuracy_reward": 0.10833333749324084, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9890625059604645, "step": 1417 }, { "clip_ratio": 0.0, "completion_length": 612.4729370117187, "epoch": 0.4538326132181149, "grad_norm": 0.1089828833937645, "kl": 0.32629442512989043, "learning_rate": 1.3295631485553838e-05, "loss": 0.102, "reward": 1.005729192495346, "reward_std": 0.12626549191772937, "rewards/accuracy_reward": 0.03541666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125119209289, "step": 1418 }, { "clip_ratio": 0.0, "completion_length": 580.3458557128906, "epoch": 0.4541526644263082, "grad_norm": 0.09334749728441238, "kl": 0.2463594913482666, "learning_rate": 1.3285077729176844e-05, "loss": 0.0788, "reward": 1.0145833551883698, "reward_std": 0.1507036415860057, "rewards/accuracy_reward": 0.0375000013038516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833432674408, "step": 1419 }, { "clip_ratio": 0.0, "completion_length": 583.6541809082031, "epoch": 0.45447271563450153, "grad_norm": 0.18406108021736145, "kl": 0.39999381825327873, "learning_rate": 1.3274519869590656e-05, "loss": 0.0841, "reward": 1.0093750357627869, "reward_std": 0.14787054806947708, "rewards/accuracy_reward": 0.039583333767950535, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916865348816, "step": 1420 }, { "clip_ratio": 0.0, "completion_length": 600.1104370117188, "epoch": 0.4547927668426948, "grad_norm": 0.16637839376926422, "kl": 0.4374181792140007, "learning_rate": 1.3263957919982516e-05, "loss": 0.0952, "reward": 1.0369791865348816, "reward_std": 0.16850600093603135, "rewards/accuracy_reward": 0.06875000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291746139526, "step": 1421 }, { "clip_ratio": 0.0, "completion_length": 592.7020935058594, "epoch": 0.45511281805088816, "grad_norm": 0.11868277937173843, "kl": 0.2869627773761749, "learning_rate": 1.325339189354477e-05, "loss": 0.0445, "reward": 1.0234375298023224, "reward_std": 0.11299450956285, "rewards/accuracy_reward": 0.0458333345130086, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9755208551883697, "step": 1422 }, { "clip_ratio": 0.0, "completion_length": 581.7312683105469, "epoch": 0.45543286925908144, "grad_norm": 0.08685970306396484, "kl": 0.25337800160050394, "learning_rate": 1.3242821803474861e-05, "loss": 0.0851, "reward": 1.0723958492279053, "reward_std": 0.13812698870897294, "rewards/accuracy_reward": 0.1000000050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958492279052, "step": 1423 }, { "clip_ratio": 0.0, "completion_length": 605.2104278564453, "epoch": 0.4557529204672748, "grad_norm": 0.10650160908699036, "kl": 0.44310767501592635, "learning_rate": 1.3232247662975304e-05, "loss": 0.1005, "reward": 1.054166704416275, "reward_std": 0.17150254175066948, "rewards/accuracy_reward": 0.08958333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833432674408, "step": 1424 }, { "clip_ratio": 0.0, "completion_length": 619.7708557128906, "epoch": 0.45607297167546806, "grad_norm": 0.0737314224243164, "kl": 0.26119700372219085, "learning_rate": 1.3221669485253672e-05, "loss": 0.0651, "reward": 1.0958333730697631, "reward_std": 0.09929907992482186, "rewards/accuracy_reward": 0.11666667126119137, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666865348816, "step": 1425 }, { "clip_ratio": 0.0, "completion_length": 575.039599609375, "epoch": 0.4563930228836614, "grad_norm": 0.23615843057632446, "kl": 0.5649647109210492, "learning_rate": 1.3211087283522586e-05, "loss": 0.1103, "reward": 1.0640625298023223, "reward_std": 0.1466512806713581, "rewards/accuracy_reward": 0.09791666995733976, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458492279053, "step": 1426 }, { "clip_ratio": 0.0, "completion_length": 573.2666870117188, "epoch": 0.4567130740918547, "grad_norm": 0.08350611478090286, "kl": 0.23456739112734795, "learning_rate": 1.3200501070999687e-05, "loss": 0.0658, "reward": 1.024479192495346, "reward_std": 0.11737537570297718, "rewards/accuracy_reward": 0.04583333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1427 }, { "clip_ratio": 0.0, "completion_length": 580.1104370117188, "epoch": 0.457033125300048, "grad_norm": 0.11149536818265915, "kl": 0.24888658449053763, "learning_rate": 1.3189910860907631e-05, "loss": 0.0654, "reward": 1.0437500476837158, "reward_std": 0.13792316131293775, "rewards/accuracy_reward": 0.06458333488553762, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666865348816, "step": 1428 }, { "clip_ratio": 0.0, "completion_length": 608.9500122070312, "epoch": 0.4573531765082413, "grad_norm": 0.08928157389163971, "kl": 0.44104146808385847, "learning_rate": 1.3179316666474063e-05, "loss": 0.0931, "reward": 1.069791704416275, "reward_std": 0.16804887503385543, "rewards/accuracy_reward": 0.10416666902601719, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9656250178813934, "step": 1429 }, { "clip_ratio": 0.0, "completion_length": 567.0541839599609, "epoch": 0.45767322771643465, "grad_norm": 0.0916551873087883, "kl": 0.4219443365931511, "learning_rate": 1.3168718500931603e-05, "loss": 0.0807, "reward": 1.061979204416275, "reward_std": 0.1655805967748165, "rewards/accuracy_reward": 0.0958333345130086, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458551883697, "step": 1430 }, { "clip_ratio": 0.0, "completion_length": 601.0854370117188, "epoch": 0.45799327892462793, "grad_norm": 0.13563001155853271, "kl": 0.29530239701271055, "learning_rate": 1.315811637751784e-05, "loss": 0.0605, "reward": 1.0119791805744172, "reward_std": 0.07693684957921505, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1431 }, { "clip_ratio": 0.0, "completion_length": 627.0708557128906, "epoch": 0.4583133301328213, "grad_norm": 0.13829165697097778, "kl": 0.6939724013209343, "learning_rate": 1.3147510309475301e-05, "loss": 0.0984, "reward": 1.0390625238418578, "reward_std": 0.14951678588986397, "rewards/accuracy_reward": 0.08750000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.951562511920929, "step": 1432 }, { "clip_ratio": 0.0, "completion_length": 613.764599609375, "epoch": 0.45863338134101456, "grad_norm": 0.07941343635320663, "kl": 0.4705329492688179, "learning_rate": 1.3136900310051438e-05, "loss": 0.1283, "reward": 1.0531250178813933, "reward_std": 0.19366805925965308, "rewards/accuracy_reward": 0.10416666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9489583492279052, "step": 1433 }, { "clip_ratio": 0.0, "completion_length": 582.504183959961, "epoch": 0.4589534325492079, "grad_norm": 0.14773200452327728, "kl": 0.30267433300614355, "learning_rate": 1.312628639249861e-05, "loss": 0.0869, "reward": 1.1286458611488341, "reward_std": 0.14947104826569557, "rewards/accuracy_reward": 0.15833333935588598, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125119209289, "step": 1434 }, { "clip_ratio": 0.0, "completion_length": 606.289599609375, "epoch": 0.4592734837574012, "grad_norm": 0.06837635487318039, "kl": 0.2559683620929718, "learning_rate": 1.3115668570074083e-05, "loss": 0.0478, "reward": 1.1395833671092988, "reward_std": 0.15164603665471077, "rewards/accuracy_reward": 0.17291667219251394, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666865348816, "step": 1435 }, { "clip_ratio": 0.0, "completion_length": 593.6312744140625, "epoch": 0.4595935349655945, "grad_norm": 0.12697356939315796, "kl": 0.35023130998015406, "learning_rate": 1.3105046856039994e-05, "loss": 0.0754, "reward": 1.110416692495346, "reward_std": 0.11461557075381279, "rewards/accuracy_reward": 0.13750000707805157, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166746139526, "step": 1436 }, { "clip_ratio": 0.0, "completion_length": 611.8250244140625, "epoch": 0.4599135861737878, "grad_norm": 0.08488718420267105, "kl": 0.3687517575919628, "learning_rate": 1.309442126366333e-05, "loss": 0.0785, "reward": 1.0333333492279053, "reward_std": 0.14628687109798194, "rewards/accuracy_reward": 0.06250000018626452, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333492279053, "step": 1437 }, { "clip_ratio": 0.0, "completion_length": 623.1646026611328, "epoch": 0.46023363738198114, "grad_norm": 0.15668301284313202, "kl": 0.31853192709386347, "learning_rate": 1.308379180621594e-05, "loss": 0.0935, "reward": 1.071354204416275, "reward_std": 0.15687522031366824, "rewards/accuracy_reward": 0.10208333767950535, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708551883698, "step": 1438 }, { "clip_ratio": 0.0, "completion_length": 592.6916931152343, "epoch": 0.4605536885901744, "grad_norm": 0.10902131348848343, "kl": 0.34631902873516085, "learning_rate": 1.3073158496974487e-05, "loss": 0.0796, "reward": 1.0937500238418578, "reward_std": 0.1236114427447319, "rewards/accuracy_reward": 0.12083333749324084, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166805744172, "step": 1439 }, { "clip_ratio": 0.0, "completion_length": 607.9875244140625, "epoch": 0.46087373979836777, "grad_norm": 0.0653231143951416, "kl": 0.14004647061228753, "learning_rate": 1.3062521349220459e-05, "loss": 0.0568, "reward": 1.0395833611488343, "reward_std": 0.13148568905889987, "rewards/accuracy_reward": 0.05416666846722364, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166805744171, "step": 1440 }, { "clip_ratio": 0.0, "completion_length": 599.5166931152344, "epoch": 0.46119379100656105, "grad_norm": 0.2973286509513855, "kl": 0.2680055730044842, "learning_rate": 1.3051880376240117e-05, "loss": 0.1017, "reward": 1.0463541984558105, "reward_std": 0.15086615979671478, "rewards/accuracy_reward": 0.07916666828095913, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9651041746139526, "step": 1441 }, { "clip_ratio": 0.0, "completion_length": 605.0729370117188, "epoch": 0.4615138422147544, "grad_norm": 0.10781467705965042, "kl": 0.46533993929624556, "learning_rate": 1.3041235591324521e-05, "loss": 0.1004, "reward": 1.0479166865348817, "reward_std": 0.1986624613404274, "rewards/accuracy_reward": 0.08750000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9604166805744171, "step": 1442 }, { "clip_ratio": 0.0, "completion_length": 595.733349609375, "epoch": 0.46183389342294767, "grad_norm": 0.08439289033412933, "kl": 0.14252968803048133, "learning_rate": 1.3030587007769486e-05, "loss": 0.0462, "reward": 1.0255208551883697, "reward_std": 0.10829742904752493, "rewards/accuracy_reward": 0.04375000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708492279053, "step": 1443 }, { "clip_ratio": 0.0, "completion_length": 594.30419921875, "epoch": 0.46215394463114096, "grad_norm": 0.13936440646648407, "kl": 0.2732355587184429, "learning_rate": 1.3019934638875565e-05, "loss": 0.0673, "reward": 1.0255208432674408, "reward_std": 0.13787070866674184, "rewards/accuracy_reward": 0.05000000055879354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208492279053, "step": 1444 }, { "clip_ratio": 0.0, "completion_length": 593.7437622070313, "epoch": 0.4624739958393343, "grad_norm": 0.1179627776145935, "kl": 0.3253137730062008, "learning_rate": 1.3009278497948046e-05, "loss": 0.0835, "reward": 1.054687535762787, "reward_std": 0.13018050529062747, "rewards/accuracy_reward": 0.08750000353902579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875178813935, "step": 1445 }, { "clip_ratio": 0.0, "completion_length": 596.2541809082031, "epoch": 0.4627940470475276, "grad_norm": 0.07859959453344345, "kl": 0.25831368714571, "learning_rate": 1.2998618598296922e-05, "loss": 0.1257, "reward": 0.9828125298023224, "reward_std": 0.14702120125293733, "rewards/accuracy_reward": 0.02083333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791805744171, "step": 1446 }, { "clip_ratio": 0.0, "completion_length": 608.6625183105468, "epoch": 0.4631140982557209, "grad_norm": 0.20410098135471344, "kl": 0.2851632609963417, "learning_rate": 1.298795495323689e-05, "loss": 0.0749, "reward": 1.0578125238418579, "reward_std": 0.11233447343111039, "rewards/accuracy_reward": 0.08750000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125178813934, "step": 1447 }, { "clip_ratio": 0.0, "completion_length": 604.816683959961, "epoch": 0.4634341494639142, "grad_norm": 0.10286929458379745, "kl": 0.39714363887906073, "learning_rate": 1.297728757608732e-05, "loss": 0.0495, "reward": 1.0468750298023224, "reward_std": 0.10802833493798972, "rewards/accuracy_reward": 0.06458333488553762, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916805744171, "step": 1448 }, { "clip_ratio": 0.0, "completion_length": 606.7479370117187, "epoch": 0.46375420067210754, "grad_norm": 0.19550444185733795, "kl": 0.3406421348452568, "learning_rate": 1.2966616480172243e-05, "loss": 0.1146, "reward": 1.0473958611488343, "reward_std": 0.1252935364842415, "rewards/accuracy_reward": 0.0812500011175871, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458492279053, "step": 1449 }, { "clip_ratio": 0.0, "completion_length": 584.3687683105469, "epoch": 0.4640742518803008, "grad_norm": 0.2726002335548401, "kl": 0.23093743473291398, "learning_rate": 1.2955941678820332e-05, "loss": 0.04, "reward": 1.0791666984558106, "reward_std": 0.13660317473113537, "rewards/accuracy_reward": 0.09791666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981250011920929, "step": 1450 }, { "clip_ratio": 0.0, "completion_length": 621.6291931152343, "epoch": 0.46439430308849416, "grad_norm": 0.12068649381399155, "kl": 0.3264674745500088, "learning_rate": 1.2945263185364895e-05, "loss": 0.0531, "reward": 1.0875000298023223, "reward_std": 0.1675384446978569, "rewards/accuracy_reward": 0.11666667014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 1451 }, { "clip_ratio": 0.0, "completion_length": 608.3708435058594, "epoch": 0.46471435429668745, "grad_norm": 0.14527225494384766, "kl": 0.2746707245707512, "learning_rate": 1.293458101314385e-05, "loss": 0.0709, "reward": 0.9901041924953461, "reward_std": 0.11297452226281166, "rewards/accuracy_reward": 0.018750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541865348816, "step": 1452 }, { "clip_ratio": 0.0, "completion_length": 607.8437622070312, "epoch": 0.4650344055048808, "grad_norm": 0.12880566716194153, "kl": 0.31128202080726625, "learning_rate": 1.292389517549971e-05, "loss": 0.1268, "reward": 1.0010416805744171, "reward_std": 0.1498323068022728, "rewards/accuracy_reward": 0.047916668094694616, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9531250178813935, "step": 1453 }, { "clip_ratio": 0.0, "completion_length": 614.5083618164062, "epoch": 0.46535445671307407, "grad_norm": 0.13385367393493652, "kl": 0.44625467509031297, "learning_rate": 1.2913205685779557e-05, "loss": 0.0819, "reward": 1.1098958611488343, "reward_std": 0.15377984158694744, "rewards/accuracy_reward": 0.14583333991467953, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9619791865348816, "step": 1454 }, { "clip_ratio": 0.0, "completion_length": 576.0291870117187, "epoch": 0.4656745079212674, "grad_norm": 0.10405029356479645, "kl": 0.2602963488548994, "learning_rate": 1.2902512557335047e-05, "loss": 0.0737, "reward": 0.9890625178813934, "reward_std": 0.1055045148357749, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1455 }, { "clip_ratio": 0.0, "completion_length": 600.4521118164063, "epoch": 0.4659945591294607, "grad_norm": 0.07072511315345764, "kl": 0.3464597135782242, "learning_rate": 1.2891815803522378e-05, "loss": 0.0692, "reward": 1.0333333611488342, "reward_std": 0.16075058728456498, "rewards/accuracy_reward": 0.06666666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666865348816, "step": 1456 }, { "clip_ratio": 0.0, "completion_length": 615.8562652587891, "epoch": 0.46631461033765403, "grad_norm": 0.13768284022808075, "kl": 0.3706891119480133, "learning_rate": 1.2881115437702274e-05, "loss": 0.0992, "reward": 1.0208333611488343, "reward_std": 0.1893759747967124, "rewards/accuracy_reward": 0.0708333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9500000178813934, "step": 1457 }, { "clip_ratio": 0.0, "completion_length": 605.783349609375, "epoch": 0.4666346615458473, "grad_norm": 0.18860451877117157, "kl": 0.361300827562809, "learning_rate": 1.287041147323997e-05, "loss": 0.1191, "reward": 1.023437535762787, "reward_std": 0.1736320335417986, "rewards/accuracy_reward": 0.060416667349636556, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208551883698, "step": 1458 }, { "clip_ratio": 0.0, "completion_length": 585.2229431152343, "epoch": 0.46695471275404066, "grad_norm": 0.10289126634597778, "kl": 0.2510980851948261, "learning_rate": 1.2859703923505194e-05, "loss": 0.1045, "reward": 1.054687535762787, "reward_std": 0.16347628384828566, "rewards/accuracy_reward": 0.08333333693444729, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541984558105, "step": 1459 }, { "clip_ratio": 0.0, "completion_length": 615.0083618164062, "epoch": 0.46727476396223394, "grad_norm": 0.20354001224040985, "kl": 0.6329892605543137, "learning_rate": 1.2848992801872159e-05, "loss": 0.1397, "reward": 0.9583333551883697, "reward_std": 0.1352860927581787, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9520833551883697, "step": 1460 }, { "clip_ratio": 0.0, "completion_length": 592.6750305175781, "epoch": 0.4675948151704273, "grad_norm": 0.08727524429559708, "kl": 0.30383690968155863, "learning_rate": 1.2838278121719536e-05, "loss": 0.0646, "reward": 1.0593750298023223, "reward_std": 0.12578080594539642, "rewards/accuracy_reward": 0.08958333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916805744171, "step": 1461 }, { "clip_ratio": 0.0, "completion_length": 588.2791870117187, "epoch": 0.46791486637862056, "grad_norm": 0.16131596267223358, "kl": 0.31957222819328307, "learning_rate": 1.2827559896430437e-05, "loss": 0.109, "reward": 1.0489583671092988, "reward_std": 0.13771594911813737, "rewards/accuracy_reward": 0.08333333544433116, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9656250238418579, "step": 1462 }, { "clip_ratio": 0.0, "completion_length": 567.7104370117188, "epoch": 0.4682349175868139, "grad_norm": 0.1428447663784027, "kl": 0.3186722435057163, "learning_rate": 1.2816838139392407e-05, "loss": 0.0811, "reward": 1.0541666924953461, "reward_std": 0.13920316584408282, "rewards/accuracy_reward": 0.08125000409781932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166805744172, "step": 1463 }, { "clip_ratio": 0.0, "completion_length": 586.6666809082031, "epoch": 0.4685549687950072, "grad_norm": 0.13763944804668427, "kl": 0.2436652660369873, "learning_rate": 1.2806112863997401e-05, "loss": 0.0541, "reward": 1.1296875298023223, "reward_std": 0.1380587823688984, "rewards/accuracy_reward": 0.15416667181998492, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 1464 }, { "clip_ratio": 0.0, "completion_length": 567.4270935058594, "epoch": 0.4688750200032005, "grad_norm": 0.10999301820993423, "kl": 0.24977174699306487, "learning_rate": 1.279538408364177e-05, "loss": 0.0907, "reward": 1.0541666984558105, "reward_std": 0.11675290018320084, "rewards/accuracy_reward": 0.07500000316649676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666746139527, "step": 1465 }, { "clip_ratio": 0.0, "completion_length": 593.502099609375, "epoch": 0.4691950712113938, "grad_norm": 0.1865115612745285, "kl": 0.29081103280186654, "learning_rate": 1.2784651811726238e-05, "loss": 0.0705, "reward": 1.0786458551883698, "reward_std": 0.10002645067870616, "rewards/accuracy_reward": 0.09791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291805744172, "step": 1466 }, { "clip_ratio": 0.0, "completion_length": 573.252099609375, "epoch": 0.46951512241958715, "grad_norm": 0.215255007147789, "kl": 0.4380812518298626, "learning_rate": 1.2773916061655893e-05, "loss": 0.1143, "reward": 1.0145833611488342, "reward_std": 0.17872334346175195, "rewards/accuracy_reward": 0.05208333544433117, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9604166865348815, "step": 1467 }, { "clip_ratio": 0.0, "completion_length": 573.3271026611328, "epoch": 0.46983517362778043, "grad_norm": 0.11711432039737701, "kl": 0.3209188118577003, "learning_rate": 1.276317684684017e-05, "loss": 0.0772, "reward": 1.0890625238418579, "reward_std": 0.0958840724080801, "rewards/accuracy_reward": 0.11250000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625119209289, "step": 1468 }, { "clip_ratio": 0.0, "completion_length": 551.5458526611328, "epoch": 0.4701552248359738, "grad_norm": 0.16603219509124756, "kl": 0.3087894439697266, "learning_rate": 1.275243418069283e-05, "loss": 0.077, "reward": 1.1411458611488343, "reward_std": 0.15554081853479146, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791865348816, "step": 1469 }, { "clip_ratio": 0.0, "completion_length": 613.839599609375, "epoch": 0.47047527604416706, "grad_norm": 0.1444757878780365, "kl": 0.5770320266485214, "learning_rate": 1.2741688076631942e-05, "loss": 0.1205, "reward": 1.0265625178813935, "reward_std": 0.16366661600768567, "rewards/accuracy_reward": 0.07083333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.955729192495346, "step": 1470 }, { "clip_ratio": 0.0, "completion_length": 576.1187683105469, "epoch": 0.4707953272523604, "grad_norm": 0.10163528472185135, "kl": 0.3341947510838509, "learning_rate": 1.2730938548079873e-05, "loss": 0.0968, "reward": 1.0072916924953461, "reward_std": 0.13251439444720745, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.965625011920929, "step": 1471 }, { "clip_ratio": 0.0, "completion_length": 568.0416839599609, "epoch": 0.4711153784605537, "grad_norm": 0.24541088938713074, "kl": 0.4345988750457764, "learning_rate": 1.2720185608463258e-05, "loss": 0.1186, "reward": 1.1192708611488342, "reward_std": 0.18835799023509026, "rewards/accuracy_reward": 0.14583333879709243, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375238418579, "step": 1472 }, { "clip_ratio": 0.0, "completion_length": 554.9062652587891, "epoch": 0.471435429668747, "grad_norm": 0.26369932293891907, "kl": 0.33843834325671196, "learning_rate": 1.2709429271213009e-05, "loss": 0.1012, "reward": 1.0208333492279054, "reward_std": 0.12742016687989235, "rewards/accuracy_reward": 0.04791666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166865348816, "step": 1473 }, { "clip_ratio": 0.0, "completion_length": 587.9416809082031, "epoch": 0.4717554808769403, "grad_norm": 0.10718197375535965, "kl": 0.31209646463394164, "learning_rate": 1.2698669549764272e-05, "loss": 0.0915, "reward": 1.0500000178813935, "reward_std": 0.15133947264403105, "rewards/accuracy_reward": 0.08125000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500178813935, "step": 1474 }, { "clip_ratio": 0.0, "completion_length": 554.1021026611328, "epoch": 0.47207553208513364, "grad_norm": 0.08452266454696655, "kl": 0.3574494168162346, "learning_rate": 1.2687906457556416e-05, "loss": 0.1113, "reward": 1.0921875417232514, "reward_std": 0.13789307028055192, "rewards/accuracy_reward": 0.11666667070239782, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208611488342, "step": 1475 }, { "clip_ratio": 0.0, "completion_length": 557.202099609375, "epoch": 0.4723955832933269, "grad_norm": 0.2100512832403183, "kl": 0.35172075033187866, "learning_rate": 1.267714000803303e-05, "loss": 0.0955, "reward": 1.0885416865348816, "reward_std": 0.12355173248797655, "rewards/accuracy_reward": 0.11458333674818277, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 1476 }, { "clip_ratio": 0.0, "completion_length": 562.2187713623047, "epoch": 0.47271563450152027, "grad_norm": 0.07331164926290512, "kl": 0.2190377414226532, "learning_rate": 1.266637021464189e-05, "loss": 0.0696, "reward": 1.0927083611488342, "reward_std": 0.07819271050393581, "rewards/accuracy_reward": 0.10833333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1477 }, { "clip_ratio": 0.0, "completion_length": 548.1166839599609, "epoch": 0.47303568570971355, "grad_norm": 0.06377699971199036, "kl": 0.2705439478158951, "learning_rate": 1.265559709083495e-05, "loss": 0.0505, "reward": 1.0239583432674408, "reward_std": 0.10356269031763077, "rewards/accuracy_reward": 0.039583335444331166, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9822916746139526, "step": 1478 }, { "clip_ratio": 0.0, "completion_length": 553.3437683105469, "epoch": 0.4733557369179069, "grad_norm": 0.08778975158929825, "kl": 0.23727463632822038, "learning_rate": 1.2644820650068323e-05, "loss": 0.0854, "reward": 1.1609375298023223, "reward_std": 0.09437338933348656, "rewards/accuracy_reward": 0.17708333767950535, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541746139526, "step": 1479 }, { "clip_ratio": 0.0, "completion_length": 559.3250183105469, "epoch": 0.4736757881261002, "grad_norm": 0.07925225049257278, "kl": 0.2749205954372883, "learning_rate": 1.2634040905802267e-05, "loss": 0.0885, "reward": 1.097916704416275, "reward_std": 0.12705145999789239, "rewards/accuracy_reward": 0.1187500013038516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666805744171, "step": 1480 }, { "clip_ratio": 0.0, "completion_length": 549.7979339599609, "epoch": 0.4739958393342935, "grad_norm": 0.06212180107831955, "kl": 0.12944966927170753, "learning_rate": 1.2623257871501165e-05, "loss": 0.0504, "reward": 1.1041666984558105, "reward_std": 0.08514518775045872, "rewards/accuracy_reward": 0.11458333693444729, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9895833432674408, "step": 1481 }, { "clip_ratio": 0.0, "completion_length": 542.9333557128906, "epoch": 0.4743158905424868, "grad_norm": 0.07425817102193832, "kl": 0.21376031935214995, "learning_rate": 1.2612471560633512e-05, "loss": 0.0614, "reward": 1.0046875119209289, "reward_std": 0.06692595425993204, "rewards/accuracy_reward": 0.01458333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9901041805744171, "step": 1482 }, { "clip_ratio": 0.0, "completion_length": 549.8854400634766, "epoch": 0.47463594175068013, "grad_norm": 0.053915202617645264, "kl": 0.1447036750614643, "learning_rate": 1.260168198667189e-05, "loss": 0.045, "reward": 1.1250000298023224, "reward_std": 0.1259409360587597, "rewards/accuracy_reward": 0.13958333749324084, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166865348816, "step": 1483 }, { "clip_ratio": 0.0, "completion_length": 540.1687652587891, "epoch": 0.4749559929588734, "grad_norm": 0.19801031053066254, "kl": 0.18226547986268998, "learning_rate": 1.2590889163092963e-05, "loss": 0.0328, "reward": 1.0447916865348816, "reward_std": 0.09412735253572464, "rewards/accuracy_reward": 0.05625000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9885416746139526, "step": 1484 }, { "clip_ratio": 0.0, "completion_length": 540.0125183105469, "epoch": 0.47527604416706676, "grad_norm": 0.1381172090768814, "kl": 0.3441140428185463, "learning_rate": 1.2580093103377446e-05, "loss": 0.0378, "reward": 1.113541692495346, "reward_std": 0.10265696365386248, "rewards/accuracy_reward": 0.1270833384245634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583432674408, "step": 1485 }, { "clip_ratio": 0.0, "completion_length": 553.4166870117188, "epoch": 0.47559609537526004, "grad_norm": 0.05816073715686798, "kl": 0.18868185505270957, "learning_rate": 1.2569293821010109e-05, "loss": 0.0696, "reward": 1.0906250298023223, "reward_std": 0.08026152718812227, "rewards/accuracy_reward": 0.10208333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9885416805744172, "step": 1486 }, { "clip_ratio": 0.0, "completion_length": 556.7750213623046, "epoch": 0.4759161465834533, "grad_norm": 0.06502912193536758, "kl": 0.13506832644343375, "learning_rate": 1.2558491329479732e-05, "loss": 0.0541, "reward": 1.0291667044162751, "reward_std": 0.08429014421999455, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333551883697, "step": 1487 }, { "clip_ratio": 0.0, "completion_length": 571.0937683105469, "epoch": 0.47623619779164666, "grad_norm": 0.04643028974533081, "kl": 0.10432569906115532, "learning_rate": 1.2547685642279113e-05, "loss": 0.0304, "reward": 1.1744791984558105, "reward_std": 0.09066424928605557, "rewards/accuracy_reward": 0.1833333380520344, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9911458373069764, "step": 1488 }, { "clip_ratio": 0.0, "completion_length": 582.2187744140625, "epoch": 0.47655624899983995, "grad_norm": 0.055091604590415955, "kl": 0.18568008467555047, "learning_rate": 1.253687677290504e-05, "loss": 0.052, "reward": 1.0333333492279053, "reward_std": 0.08247921578586101, "rewards/accuracy_reward": 0.04791666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166746139527, "step": 1489 }, { "clip_ratio": 0.0, "completion_length": 585.9479370117188, "epoch": 0.4768763002080333, "grad_norm": 0.11636830866336823, "kl": 0.17313535772264005, "learning_rate": 1.2526064734858277e-05, "loss": 0.0694, "reward": 1.1880208730697632, "reward_std": 0.1414164997637272, "rewards/accuracy_reward": 0.21250000800937413, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 1490 }, { "clip_ratio": 0.0, "completion_length": 566.087515258789, "epoch": 0.47719635141622657, "grad_norm": 0.04212115705013275, "kl": 0.13310096897184848, "learning_rate": 1.2515249541643537e-05, "loss": 0.0488, "reward": 1.0312500178813935, "reward_std": 0.06270579267293215, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9895833492279053, "step": 1491 }, { "clip_ratio": 0.0, "completion_length": 594.9500305175782, "epoch": 0.4775164026244199, "grad_norm": 0.0569349005818367, "kl": 0.1305923953652382, "learning_rate": 1.2504431206769487e-05, "loss": 0.0547, "reward": 1.0276041746139526, "reward_std": 0.09775371477007866, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 1492 }, { "clip_ratio": 0.0, "completion_length": 599.3604431152344, "epoch": 0.4778364538326132, "grad_norm": 0.09578822553157806, "kl": 0.1873940646648407, "learning_rate": 1.2493609743748709e-05, "loss": 0.0518, "reward": 1.079166692495346, "reward_std": 0.1045138731598854, "rewards/accuracy_reward": 0.09583333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333432674408, "step": 1493 }, { "clip_ratio": 0.0, "completion_length": 592.6750244140625, "epoch": 0.47815650504080653, "grad_norm": 0.05655219033360481, "kl": 0.1329231120646, "learning_rate": 1.2482785166097697e-05, "loss": 0.0415, "reward": 1.0520833671092986, "reward_std": 0.11519988998770714, "rewards/accuracy_reward": 0.06875000111758708, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333551883697, "step": 1494 }, { "clip_ratio": 0.0, "completion_length": 590.6271118164062, "epoch": 0.4784765562489998, "grad_norm": 0.1878066509962082, "kl": 0.2167329777032137, "learning_rate": 1.247195748733683e-05, "loss": 0.0675, "reward": 1.0916666984558105, "reward_std": 0.08340043239295483, "rewards/accuracy_reward": 0.11041666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9812500178813934, "step": 1495 }, { "clip_ratio": 0.0, "completion_length": 573.614599609375, "epoch": 0.47879660745719316, "grad_norm": 0.06385314464569092, "kl": 0.18466985821723939, "learning_rate": 1.2461126720990367e-05, "loss": 0.0717, "reward": 1.004687523841858, "reward_std": 0.08886413350701332, "rewards/accuracy_reward": 0.0229166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708551883697, "step": 1496 }, { "clip_ratio": 0.0, "completion_length": 605.7083435058594, "epoch": 0.47911665866538644, "grad_norm": 0.0636759102344513, "kl": 0.12463123425841331, "learning_rate": 1.2450292880586414e-05, "loss": 0.0456, "reward": 1.0427083551883698, "reward_std": 0.05680535212159157, "rewards/accuracy_reward": 0.05625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583432674408, "step": 1497 }, { "clip_ratio": 0.0, "completion_length": 588.3458618164062, "epoch": 0.4794367098735798, "grad_norm": 0.1012827605009079, "kl": 0.1598178006708622, "learning_rate": 1.2439455979656931e-05, "loss": 0.0343, "reward": 1.0802083551883697, "reward_std": 0.0793739415705204, "rewards/accuracy_reward": 0.08958333488553763, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9885416805744172, "step": 1498 }, { "clip_ratio": 0.0, "completion_length": 581.6041900634766, "epoch": 0.47975676108177306, "grad_norm": 0.15515443682670593, "kl": 0.21187707930803298, "learning_rate": 1.2428616031737688e-05, "loss": 0.0709, "reward": 1.1161458551883698, "reward_std": 0.12064327895641327, "rewards/accuracy_reward": 0.13333333879709244, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982812511920929, "step": 1499 }, { "clip_ratio": 0.0, "completion_length": 594.1687683105469, "epoch": 0.4800768122899664, "grad_norm": 0.10084983706474304, "kl": 0.128275853022933, "learning_rate": 1.241777305036827e-05, "loss": 0.0384, "reward": 1.0453125178813933, "reward_std": 0.12377176433801651, "rewards/accuracy_reward": 0.06250000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9828125178813935, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 573.1291870117187, "epoch": 0.4803968634981597, "grad_norm": 0.09301966428756714, "kl": 0.18412938639521598, "learning_rate": 1.2406927049092034e-05, "loss": 0.0557, "reward": 1.0333333611488342, "reward_std": 0.08592780809849501, "rewards/accuracy_reward": 0.05416666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666865348816, "step": 1501 }, { "clip_ratio": 0.0, "completion_length": 563.5791870117188, "epoch": 0.480716914706353, "grad_norm": 0.1392291635274887, "kl": 0.31433880925178526, "learning_rate": 1.2396078041456137e-05, "loss": 0.1133, "reward": 0.9890625298023223, "reward_std": 0.12419578358530999, "rewards/accuracy_reward": 0.01458333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791924953461, "step": 1502 }, { "clip_ratio": 0.0, "completion_length": 574.0854278564453, "epoch": 0.4810369659145463, "grad_norm": 0.21207989752292633, "kl": 0.3643794015049934, "learning_rate": 1.2385226041011464e-05, "loss": 0.0725, "reward": 1.076562523841858, "reward_std": 0.13115446493029595, "rewards/accuracy_reward": 0.09583333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291805744172, "step": 1503 }, { "clip_ratio": 0.0, "completion_length": 605.4500152587891, "epoch": 0.48135701712273965, "grad_norm": 0.10227753221988678, "kl": 0.2762680515646935, "learning_rate": 1.2374371061312655e-05, "loss": 0.0846, "reward": 1.1239583671092988, "reward_std": 0.1950426936149597, "rewards/accuracy_reward": 0.14375000596046447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083551883698, "step": 1504 }, { "clip_ratio": 0.0, "completion_length": 622.4208435058594, "epoch": 0.48167706833093293, "grad_norm": 0.08751754462718964, "kl": 0.33224751353263854, "learning_rate": 1.2363513115918065e-05, "loss": 0.072, "reward": 1.0447916984558105, "reward_std": 0.11386512406170368, "rewards/accuracy_reward": 0.0708333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583551883697, "step": 1505 }, { "clip_ratio": 0.0, "completion_length": 593.8750183105469, "epoch": 0.4819971195391263, "grad_norm": 0.16370034217834473, "kl": 0.20460733622312546, "learning_rate": 1.2352652218389754e-05, "loss": 0.0757, "reward": 1.0338541746139527, "reward_std": 0.1230011885985732, "rewards/accuracy_reward": 0.05625000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041746139527, "step": 1506 }, { "clip_ratio": 0.0, "completion_length": 597.0771026611328, "epoch": 0.48231717074731956, "grad_norm": 0.07614739239215851, "kl": 0.2973733879625797, "learning_rate": 1.2341788382293467e-05, "loss": 0.0886, "reward": 1.0916666984558105, "reward_std": 0.13258982375264167, "rewards/accuracy_reward": 0.11458333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833492279053, "step": 1507 }, { "clip_ratio": 0.0, "completion_length": 589.1729370117188, "epoch": 0.4826372219555129, "grad_norm": 0.34557607769966125, "kl": 0.2178966648876667, "learning_rate": 1.2330921621198624e-05, "loss": 0.0636, "reward": 1.0552083671092987, "reward_std": 0.11473358049988747, "rewards/accuracy_reward": 0.07500000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083492279052, "step": 1508 }, { "clip_ratio": 0.0, "completion_length": 584.839599609375, "epoch": 0.4829572731637062, "grad_norm": 0.3721748888492584, "kl": 0.33902390524744985, "learning_rate": 1.2320051948678295e-05, "loss": 0.0928, "reward": 1.1062500298023223, "reward_std": 0.14177928101271392, "rewards/accuracy_reward": 0.13333333786576987, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166924953461, "step": 1509 }, { "clip_ratio": 0.0, "completion_length": 595.5125122070312, "epoch": 0.4832773243718995, "grad_norm": 0.10752350836992264, "kl": 0.30240178406238555, "learning_rate": 1.2309179378309188e-05, "loss": 0.1099, "reward": 1.0609375357627868, "reward_std": 0.15852489322423935, "rewards/accuracy_reward": 0.09791667088866234, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208551883698, "step": 1510 }, { "clip_ratio": 0.0, "completion_length": 605.4062683105469, "epoch": 0.4835973755800928, "grad_norm": 0.10335452109575272, "kl": 0.34756200537085535, "learning_rate": 1.2298303923671635e-05, "loss": 0.0878, "reward": 1.1312500417232514, "reward_std": 0.14642674401402472, "rewards/accuracy_reward": 0.16041667088866235, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333551883698, "step": 1511 }, { "clip_ratio": 0.0, "completion_length": 584.2604431152344, "epoch": 0.48391742678828614, "grad_norm": 0.09037502855062485, "kl": 0.2697148099541664, "learning_rate": 1.2287425598349558e-05, "loss": 0.0704, "reward": 1.0531250357627868, "reward_std": 0.15801854096353055, "rewards/accuracy_reward": 0.08125000447034836, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750178813934, "step": 1512 }, { "clip_ratio": 0.0, "completion_length": 613.7021057128907, "epoch": 0.4842374779964794, "grad_norm": 0.10988093167543411, "kl": 0.41062536016106604, "learning_rate": 1.2276544415930476e-05, "loss": 0.0912, "reward": 1.0375000417232514, "reward_std": 0.2113563533872366, "rewards/accuracy_reward": 0.0770833371207118, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9583333492279053, "step": 1513 }, { "clip_ratio": 0.0, "completion_length": 603.2291931152344, "epoch": 0.48455752920467277, "grad_norm": 0.12491331994533539, "kl": 0.4697680056095123, "learning_rate": 1.2265660390005474e-05, "loss": 0.0787, "reward": 1.0557291805744171, "reward_std": 0.13224833961576224, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125119209289, "step": 1514 }, { "clip_ratio": 0.0, "completion_length": 556.4583618164063, "epoch": 0.48487758041286605, "grad_norm": 0.20729385316371918, "kl": 0.41887201070785524, "learning_rate": 1.2254773534169188e-05, "loss": 0.0538, "reward": 1.1151042103767395, "reward_std": 0.10528469458222389, "rewards/accuracy_reward": 0.1458333373069763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708551883698, "step": 1515 }, { "clip_ratio": 0.0, "completion_length": 579.0312744140625, "epoch": 0.4851976316210594, "grad_norm": 0.12627162039279938, "kl": 0.40239308923482897, "learning_rate": 1.2243883862019787e-05, "loss": 0.1111, "reward": 1.1109375536441803, "reward_std": 0.18767590299248696, "rewards/accuracy_reward": 0.14375000558793544, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875178813935, "step": 1516 }, { "clip_ratio": 0.0, "completion_length": 605.4666809082031, "epoch": 0.4855176828292527, "grad_norm": 0.16976335644721985, "kl": 0.4639628753066063, "learning_rate": 1.2232991387158957e-05, "loss": 0.0657, "reward": 1.0390625298023224, "reward_std": 0.1840406185016036, "rewards/accuracy_reward": 0.0687500013038516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125119209289, "step": 1517 }, { "clip_ratio": 0.0, "completion_length": 601.6625244140625, "epoch": 0.485837734037446, "grad_norm": 0.2504444718360901, "kl": 0.4745438635349274, "learning_rate": 1.2222096123191891e-05, "loss": 0.0635, "reward": 1.154166692495346, "reward_std": 0.17704404518008232, "rewards/accuracy_reward": 0.18750000409781933, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666805744171, "step": 1518 }, { "clip_ratio": 0.0, "completion_length": 610.3666931152344, "epoch": 0.4861577852456393, "grad_norm": 0.15202420949935913, "kl": 0.6530636228621006, "learning_rate": 1.2211198083727262e-05, "loss": 0.11, "reward": 1.0015625178813934, "reward_std": 0.1717325121164322, "rewards/accuracy_reward": 0.0479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9536458492279053, "step": 1519 }, { "clip_ratio": 0.0, "completion_length": 583.3354370117188, "epoch": 0.48647783645383263, "grad_norm": 0.20241940021514893, "kl": 0.46638872250914576, "learning_rate": 1.2200297282377207e-05, "loss": 0.0907, "reward": 1.0963541984558105, "reward_std": 0.13941559456288816, "rewards/accuracy_reward": 0.1270833384245634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9692708492279053, "step": 1520 }, { "clip_ratio": 0.0, "completion_length": 586.6875244140625, "epoch": 0.4867978876620259, "grad_norm": 0.1839069426059723, "kl": 0.36424013748764994, "learning_rate": 1.2189393732757313e-05, "loss": 0.0786, "reward": 1.035416704416275, "reward_std": 0.10752957388758659, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666805744171, "step": 1521 }, { "clip_ratio": 0.0, "completion_length": 552.3687622070313, "epoch": 0.48711793887021926, "grad_norm": 0.3818299174308777, "kl": 0.43634158819913865, "learning_rate": 1.2178487448486607e-05, "loss": 0.0566, "reward": 1.0375000298023225, "reward_std": 0.12123782895505428, "rewards/accuracy_reward": 0.06250000130385161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 1522 }, { "clip_ratio": 0.0, "completion_length": 584.6416931152344, "epoch": 0.48743799007841254, "grad_norm": 0.1835007667541504, "kl": 0.6065421789884567, "learning_rate": 1.2167578443187523e-05, "loss": 0.0932, "reward": 1.1057291746139526, "reward_std": 0.14355219900608063, "rewards/accuracy_reward": 0.13958333637565373, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458492279053, "step": 1523 }, { "clip_ratio": 0.0, "completion_length": 591.9229309082032, "epoch": 0.4877580412866059, "grad_norm": 0.23025159537792206, "kl": 0.3891874521970749, "learning_rate": 1.2156666730485895e-05, "loss": 0.0711, "reward": 1.0072916805744172, "reward_std": 0.15119225680828094, "rewards/accuracy_reward": 0.035416667349636555, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.971875011920929, "step": 1524 }, { "clip_ratio": 0.0, "completion_length": 613.2854370117187, "epoch": 0.48807809249479917, "grad_norm": 0.15648485720157623, "kl": 0.45245604068040846, "learning_rate": 1.2145752324010948e-05, "loss": 0.0598, "reward": 1.0708333492279052, "reward_std": 0.10634525269269943, "rewards/accuracy_reward": 0.09583333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 1525 }, { "clip_ratio": 0.0, "completion_length": 566.4625122070313, "epoch": 0.4883981437029925, "grad_norm": 0.23647770285606384, "kl": 0.31469220519065855, "learning_rate": 1.2134835237395254e-05, "loss": 0.054, "reward": 0.9968750238418579, "reward_std": 0.1189862385392189, "rewards/accuracy_reward": 0.020833333767950536, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1526 }, { "clip_ratio": 0.0, "completion_length": 579.8854339599609, "epoch": 0.4887181949111858, "grad_norm": 0.08674870431423187, "kl": 0.32070714607834816, "learning_rate": 1.2123915484274755e-05, "loss": 0.0821, "reward": 1.0953125476837158, "reward_std": 0.1944058895111084, "rewards/accuracy_reward": 0.12083333935588599, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791686534882, "step": 1527 }, { "clip_ratio": 0.0, "completion_length": 574.8812805175781, "epoch": 0.4890382461193791, "grad_norm": 0.09347715973854065, "kl": 0.32560470774769784, "learning_rate": 1.2112993078288702e-05, "loss": 0.0686, "reward": 1.021875023841858, "reward_std": 0.12335868813097477, "rewards/accuracy_reward": 0.0458333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1528 }, { "clip_ratio": 0.0, "completion_length": 586.4937774658204, "epoch": 0.4893582973275724, "grad_norm": 0.17592473328113556, "kl": 0.4667899638414383, "learning_rate": 1.2102068033079672e-05, "loss": 0.1084, "reward": 1.0171875298023223, "reward_std": 0.16079341247677803, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375059604645, "step": 1529 }, { "clip_ratio": 0.0, "completion_length": 597.1625183105468, "epoch": 0.4896783485357657, "grad_norm": 0.18604399263858795, "kl": 0.6784585162997245, "learning_rate": 1.2091140362293538e-05, "loss": 0.0743, "reward": 1.1005208611488342, "reward_std": 0.17486817315220832, "rewards/accuracy_reward": 0.1333333384245634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875238418579, "step": 1530 }, { "clip_ratio": 0.0, "completion_length": 583.4166870117188, "epoch": 0.48999839974395903, "grad_norm": 0.24168041348457336, "kl": 0.4977486953139305, "learning_rate": 1.2080210079579452e-05, "loss": 0.094, "reward": 1.052604192495346, "reward_std": 0.1793554574251175, "rewards/accuracy_reward": 0.08541666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875, "step": 1531 }, { "clip_ratio": 0.0, "completion_length": 585.9062683105469, "epoch": 0.4903184509521523, "grad_norm": 0.27037033438682556, "kl": 0.486817866563797, "learning_rate": 1.2069277198589819e-05, "loss": 0.0892, "reward": 1.1145833730697632, "reward_std": 0.15329679772257804, "rewards/accuracy_reward": 0.14583333786576985, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500119209289, "step": 1532 }, { "clip_ratio": 0.0, "completion_length": 600.5521057128906, "epoch": 0.49063850216034566, "grad_norm": 0.12519511580467224, "kl": 0.5029525205492973, "learning_rate": 1.2058341732980303e-05, "loss": 0.0983, "reward": 1.071875023841858, "reward_std": 0.1138947419822216, "rewards/accuracy_reward": 0.10208333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916805744171, "step": 1533 }, { "clip_ratio": 0.0, "completion_length": 596.558349609375, "epoch": 0.49095855336853894, "grad_norm": 0.2664872407913208, "kl": 0.3999428883194923, "learning_rate": 1.2047403696409787e-05, "loss": 0.0554, "reward": 1.012500023841858, "reward_std": 0.09246203228831291, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166746139526, "step": 1534 }, { "clip_ratio": 0.0, "completion_length": 556.7562683105468, "epoch": 0.4912786045767323, "grad_norm": 0.21524342894554138, "kl": 0.5321291498839855, "learning_rate": 1.2036463102540375e-05, "loss": 0.0624, "reward": 1.0281250119209289, "reward_std": 0.20354544073343278, "rewards/accuracy_reward": 0.06250000242143869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9656250178813934, "step": 1535 }, { "clip_ratio": 0.0, "completion_length": 577.5437713623047, "epoch": 0.49159865578492556, "grad_norm": 0.1489776223897934, "kl": 0.42065939456224444, "learning_rate": 1.202551996503735e-05, "loss": 0.1074, "reward": 1.012500023841858, "reward_std": 0.11747174710035324, "rewards/accuracy_reward": 0.045833334885537626, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9666666865348816, "step": 1536 }, { "clip_ratio": 0.0, "completion_length": 554.7625213623047, "epoch": 0.4919187069931189, "grad_norm": 0.12150443345308304, "kl": 0.27668090388178823, "learning_rate": 1.2014574297569182e-05, "loss": 0.044, "reward": 1.1286458551883698, "reward_std": 0.08452764227986335, "rewards/accuracy_reward": 0.14166667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9869791686534881, "step": 1537 }, { "clip_ratio": 0.0, "completion_length": 575.3229370117188, "epoch": 0.4922387582013122, "grad_norm": 0.2578086256980896, "kl": 0.4889775365591049, "learning_rate": 1.2003626113807504e-05, "loss": 0.1198, "reward": 1.1427083611488342, "reward_std": 0.1477345634251833, "rewards/accuracy_reward": 0.17291667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.969791692495346, "step": 1538 }, { "clip_ratio": 0.0, "completion_length": 571.3729370117187, "epoch": 0.4925588094095055, "grad_norm": 0.2890605926513672, "kl": 0.750163146853447, "learning_rate": 1.1992675427427085e-05, "loss": 0.0899, "reward": 1.0348958611488341, "reward_std": 0.18009860459715127, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458492279053, "step": 1539 }, { "clip_ratio": 0.0, "completion_length": 600.2646026611328, "epoch": 0.4928788606176988, "grad_norm": 0.2128416746854782, "kl": 0.7321541458368301, "learning_rate": 1.1981722252105827e-05, "loss": 0.0875, "reward": 0.9989583432674408, "reward_std": 0.15414710491895675, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9614583432674408, "step": 1540 }, { "clip_ratio": 0.0, "completion_length": 570.1687774658203, "epoch": 0.49319891182589215, "grad_norm": 0.31140488386154175, "kl": 0.5592325419187546, "learning_rate": 1.1970766601524733e-05, "loss": 0.1257, "reward": 1.0786458671092987, "reward_std": 0.20177911669015886, "rewards/accuracy_reward": 0.11250000279396773, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458492279053, "step": 1541 }, { "clip_ratio": 0.0, "completion_length": 581.1750305175781, "epoch": 0.49351896303408543, "grad_norm": 0.21769623458385468, "kl": 0.8614889137446881, "learning_rate": 1.1959808489367897e-05, "loss": 0.0884, "reward": 1.0255208432674408, "reward_std": 0.15918941050767899, "rewards/accuracy_reward": 0.07916666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9463541805744171, "step": 1542 }, { "clip_ratio": 0.0, "completion_length": 580.5604370117187, "epoch": 0.4938390142422788, "grad_norm": 0.15683266520500183, "kl": 0.3458200544118881, "learning_rate": 1.1948847929322498e-05, "loss": 0.0774, "reward": 1.0020833551883697, "reward_std": 0.10447518564760686, "rewards/accuracy_reward": 0.02083333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9812500178813934, "step": 1543 }, { "clip_ratio": 0.0, "completion_length": 579.0687683105468, "epoch": 0.49415906545047206, "grad_norm": 0.25657808780670166, "kl": 0.3729611948132515, "learning_rate": 1.1937884935078767e-05, "loss": 0.1044, "reward": 1.0755208551883697, "reward_std": 0.12975474894046785, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973437511920929, "step": 1544 }, { "clip_ratio": 0.0, "completion_length": 514.5958435058594, "epoch": 0.4944791166586654, "grad_norm": 0.28509220480918884, "kl": 0.39910185933113096, "learning_rate": 1.192691952032997e-05, "loss": 0.0795, "reward": 1.0265625238418579, "reward_std": 0.1085195817053318, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958373069763, "step": 1545 }, { "clip_ratio": 0.0, "completion_length": 545.1687774658203, "epoch": 0.4947991678668587, "grad_norm": 0.08284153044223785, "kl": 0.42232955545186995, "learning_rate": 1.1915951698772403e-05, "loss": 0.0637, "reward": 1.0494791924953462, "reward_std": 0.125248346850276, "rewards/accuracy_reward": 0.07083333507180214, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458551883698, "step": 1546 }, { "clip_ratio": 0.0, "completion_length": 553.6583587646485, "epoch": 0.495119219075052, "grad_norm": 0.19617708027362823, "kl": 0.3461019277572632, "learning_rate": 1.1904981484105367e-05, "loss": 0.0888, "reward": 1.0942708551883698, "reward_std": 0.12184848748147488, "rewards/accuracy_reward": 0.1187500050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 1547 }, { "clip_ratio": 0.0, "completion_length": 569.3375061035156, "epoch": 0.4954392702832453, "grad_norm": 0.12780652940273285, "kl": 0.6290895700454712, "learning_rate": 1.1894008890031152e-05, "loss": 0.0819, "reward": 1.0213541865348816, "reward_std": 0.1498611181974411, "rewards/accuracy_reward": 0.06041666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375119209289, "step": 1548 }, { "clip_ratio": 0.0, "completion_length": 544.0604309082031, "epoch": 0.49575932149143864, "grad_norm": 0.2656075656414032, "kl": 0.5985722355544567, "learning_rate": 1.1883033930255018e-05, "loss": 0.113, "reward": 1.0385416984558105, "reward_std": 0.13213938027620314, "rewards/accuracy_reward": 0.06041666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250119209289, "step": 1549 }, { "clip_ratio": 0.0, "completion_length": 547.5437622070312, "epoch": 0.4960793726996319, "grad_norm": 0.10763701051473618, "kl": 0.33446870669722556, "learning_rate": 1.1872056618485183e-05, "loss": 0.1121, "reward": 1.0614583551883698, "reward_std": 0.14568435922265052, "rewards/accuracy_reward": 0.08958333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9718750059604645, "step": 1550 }, { "clip_ratio": 0.0, "completion_length": 583.8854309082031, "epoch": 0.49639942390782527, "grad_norm": 0.1511547714471817, "kl": 0.42227124869823457, "learning_rate": 1.1861076968432794e-05, "loss": 0.0709, "reward": 1.0765625298023225, "reward_std": 0.12650047056376934, "rewards/accuracy_reward": 0.10625000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9703125178813934, "step": 1551 }, { "clip_ratio": 0.0, "completion_length": 561.7541809082031, "epoch": 0.49671947511601855, "grad_norm": 0.10951292514801025, "kl": 0.31010197959840297, "learning_rate": 1.1850094993811936e-05, "loss": 0.0808, "reward": 1.0333333551883697, "reward_std": 0.11681788396090269, "rewards/accuracy_reward": 0.052083333395421504, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9812500178813934, "step": 1552 }, { "clip_ratio": 0.0, "completion_length": 576.702099609375, "epoch": 0.4970395263242119, "grad_norm": 0.1494865119457245, "kl": 0.45758294574916364, "learning_rate": 1.183911070833958e-05, "loss": 0.0535, "reward": 1.0880208551883697, "reward_std": 0.1360863834619522, "rewards/accuracy_reward": 0.10416666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541805744171, "step": 1553 }, { "clip_ratio": 0.0, "completion_length": 601.3000122070313, "epoch": 0.4973595775324052, "grad_norm": 0.12859082221984863, "kl": 0.47827325090765954, "learning_rate": 1.1828124125735597e-05, "loss": 0.0762, "reward": 1.0489583492279053, "reward_std": 0.11680370531976222, "rewards/accuracy_reward": 0.06875000111758708, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083492279052, "step": 1554 }, { "clip_ratio": 0.0, "completion_length": 568.5396057128906, "epoch": 0.4976796287405985, "grad_norm": 0.14917534589767456, "kl": 0.4069339819252491, "learning_rate": 1.1817135259722707e-05, "loss": 0.0827, "reward": 1.094791704416275, "reward_std": 0.13878009673207997, "rewards/accuracy_reward": 0.11666667181998491, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250119209289, "step": 1555 }, { "clip_ratio": 0.0, "completion_length": 565.6979370117188, "epoch": 0.4979996799487918, "grad_norm": 0.20614580810070038, "kl": 0.3559134520590305, "learning_rate": 1.1806144124026514e-05, "loss": 0.0353, "reward": 1.083854180574417, "reward_std": 0.1292146436870098, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541686534882, "step": 1556 }, { "clip_ratio": 0.0, "completion_length": 566.9000183105469, "epoch": 0.49831973115698514, "grad_norm": 0.054509177803993225, "kl": 0.16616240218281747, "learning_rate": 1.1795150732375425e-05, "loss": 0.0222, "reward": 1.007812511920929, "reward_std": 0.05190774351358414, "rewards/accuracy_reward": 0.01458333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9932291686534882, "step": 1557 }, { "clip_ratio": 0.0, "completion_length": 571.9208435058594, "epoch": 0.4986397823651784, "grad_norm": 0.09618407487869263, "kl": 0.27430230379104614, "learning_rate": 1.1784155098500682e-05, "loss": 0.061, "reward": 1.0598958671092986, "reward_std": 0.0757111854851246, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958551883698, "step": 1558 }, { "clip_ratio": 0.0, "completion_length": 559.4854339599609, "epoch": 0.49895983357337176, "grad_norm": 0.2027837336063385, "kl": 0.5071586236357689, "learning_rate": 1.1773157236136328e-05, "loss": 0.0846, "reward": 1.0614583551883698, "reward_std": 0.1345365099608898, "rewards/accuracy_reward": 0.08541666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1559 }, { "clip_ratio": 0.0, "completion_length": 580.9750244140625, "epoch": 0.49927988478156504, "grad_norm": 0.07568039745092392, "kl": 0.18464777991175652, "learning_rate": 1.1762157159019184e-05, "loss": 0.042, "reward": 0.9994791865348815, "reward_std": 0.0876783674582839, "rewards/accuracy_reward": 0.01458333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958373069763, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 559.9687622070312, "epoch": 0.4995999359897584, "grad_norm": 0.09287263453006744, "kl": 0.21261435151100158, "learning_rate": 1.1751154880888835e-05, "loss": 0.034, "reward": 1.0916666865348816, "reward_std": 0.10271332561969757, "rewards/accuracy_reward": 0.10208333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9895833373069763, "step": 1561 }, { "clip_ratio": 0.0, "completion_length": 563.5041931152343, "epoch": 0.49991998719795167, "grad_norm": 0.06782330572605133, "kl": 0.26801488250494004, "learning_rate": 1.1740150415487621e-05, "loss": 0.0638, "reward": 1.053125023841858, "reward_std": 0.1198198726400733, "rewards/accuracy_reward": 0.07083333488553763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916746139526, "step": 1562 }, { "clip_ratio": 0.0, "completion_length": 575.9812744140625, "epoch": 0.500240038406145, "grad_norm": 0.24701856076717377, "kl": 0.2620903179049492, "learning_rate": 1.1729143776560614e-05, "loss": 0.1063, "reward": 1.0911458671092986, "reward_std": 0.1619997039437294, "rewards/accuracy_reward": 0.1125000050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1563 }, { "clip_ratio": 0.0, "completion_length": 572.0208526611328, "epoch": 0.5005600896143383, "grad_norm": 0.12949657440185547, "kl": 0.30676123052835463, "learning_rate": 1.17181349778556e-05, "loss": 0.0417, "reward": 1.0942708492279052, "reward_std": 0.16428543999791145, "rewards/accuracy_reward": 0.11666666734963656, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 1564 }, { "clip_ratio": 0.0, "completion_length": 559.2500152587891, "epoch": 0.5008801408225316, "grad_norm": 0.1947976052761078, "kl": 0.3174207493662834, "learning_rate": 1.1707124033123058e-05, "loss": 0.0828, "reward": 1.0536458611488342, "reward_std": 0.1148330207914114, "rewards/accuracy_reward": 0.07500000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1565 }, { "clip_ratio": 0.0, "completion_length": 582.660433959961, "epoch": 0.501200192030725, "grad_norm": 0.13384360074996948, "kl": 0.4696915991604328, "learning_rate": 1.1696110956116151e-05, "loss": 0.1037, "reward": 1.0854166984558105, "reward_std": 0.12102588117122651, "rewards/accuracy_reward": 0.10625000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666746139527, "step": 1566 }, { "clip_ratio": 0.0, "completion_length": 549.2395965576172, "epoch": 0.5015202432389182, "grad_norm": 0.38326171040534973, "kl": 0.25607917830348015, "learning_rate": 1.1685095760590706e-05, "loss": 0.0815, "reward": 1.0500000178813935, "reward_std": 0.11502253264188766, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833492279053, "step": 1567 }, { "clip_ratio": 0.0, "completion_length": 581.458349609375, "epoch": 0.5018402944471115, "grad_norm": 0.15577267110347748, "kl": 0.39244875088334086, "learning_rate": 1.1674078460305199e-05, "loss": 0.0476, "reward": 1.035416692495346, "reward_std": 0.12932371124625205, "rewards/accuracy_reward": 0.05208333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333432674408, "step": 1568 }, { "clip_ratio": 0.0, "completion_length": 595.4021026611329, "epoch": 0.5021603456553049, "grad_norm": 0.20703621208667755, "kl": 0.300426347181201, "learning_rate": 1.1663059069020728e-05, "loss": 0.0965, "reward": 1.112500011920929, "reward_std": 0.15419322103261948, "rewards/accuracy_reward": 0.13541666995733975, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833373069763, "step": 1569 }, { "clip_ratio": 0.0, "completion_length": 578.2916900634766, "epoch": 0.5024803968634982, "grad_norm": 0.13127927482128143, "kl": 0.3124852038919926, "learning_rate": 1.1652037600501007e-05, "loss": 0.0743, "reward": 1.0692708611488342, "reward_std": 0.1161904064938426, "rewards/accuracy_reward": 0.08958333879709243, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979687511920929, "step": 1570 }, { "clip_ratio": 0.0, "completion_length": 603.0646026611328, "epoch": 0.5028004480716914, "grad_norm": 0.24042336642742157, "kl": 0.44360672757029534, "learning_rate": 1.1641014068512342e-05, "loss": 0.071, "reward": 1.0895833849906922, "reward_std": 0.14635562822222709, "rewards/accuracy_reward": 0.11458333879709244, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000238418579, "step": 1571 }, { "clip_ratio": 0.0, "completion_length": 595.0375244140625, "epoch": 0.5031204992798848, "grad_norm": 0.17726963758468628, "kl": 0.32658193036913874, "learning_rate": 1.162998848682362e-05, "loss": 0.0587, "reward": 1.1692708730697632, "reward_std": 0.12005004528909921, "rewards/accuracy_reward": 0.1812500072643161, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9880208492279052, "step": 1572 }, { "clip_ratio": 0.0, "completion_length": 548.9583557128906, "epoch": 0.5034405504880781, "grad_norm": 0.38185715675354004, "kl": 0.6799829356372357, "learning_rate": 1.1618960869206287e-05, "loss": 0.118, "reward": 1.0156250178813935, "reward_std": 0.14403234291821718, "rewards/accuracy_reward": 0.0354166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083492279052, "step": 1573 }, { "clip_ratio": 0.0, "completion_length": 587.5062744140625, "epoch": 0.5037606016962713, "grad_norm": 0.19469714164733887, "kl": 0.5438534311950207, "learning_rate": 1.1607931229434328e-05, "loss": 0.0845, "reward": 1.0177083671092988, "reward_std": 0.09994984827935696, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416865348815, "step": 1574 }, { "clip_ratio": 0.0, "completion_length": 591.4979309082031, "epoch": 0.5040806529044647, "grad_norm": 0.16624751687049866, "kl": 0.49131586998701093, "learning_rate": 1.1596899581284263e-05, "loss": 0.0904, "reward": 1.0052083671092986, "reward_std": 0.1507533010095358, "rewards/accuracy_reward": 0.033333334140479565, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.971875011920929, "step": 1575 }, { "clip_ratio": 0.0, "completion_length": 547.9333465576171, "epoch": 0.504400704112658, "grad_norm": 0.12965738773345947, "kl": 0.3315769825130701, "learning_rate": 1.1585865938535106e-05, "loss": 0.0851, "reward": 1.079687523841858, "reward_std": 0.10819105207920074, "rewards/accuracy_reward": 0.09791666995733976, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708373069763, "step": 1576 }, { "clip_ratio": 0.0, "completion_length": 576.7479309082031, "epoch": 0.5047207553208514, "grad_norm": 0.10354313999414444, "kl": 0.501846868172288, "learning_rate": 1.157483031496838e-05, "loss": 0.0522, "reward": 1.1197916865348816, "reward_std": 0.11418646480888128, "rewards/accuracy_reward": 0.13750000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916805744171, "step": 1577 }, { "clip_ratio": 0.0, "completion_length": 573.2604370117188, "epoch": 0.5050408065290446, "grad_norm": 0.10002946853637695, "kl": 0.256185794621706, "learning_rate": 1.1563792724368066e-05, "loss": 0.0363, "reward": 1.0333333551883697, "reward_std": 0.08332265578210354, "rewards/accuracy_reward": 0.04583333563059568, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987500011920929, "step": 1578 }, { "clip_ratio": 0.0, "completion_length": 601.5146057128907, "epoch": 0.5053608577372379, "grad_norm": 0.08440849930047989, "kl": 0.23348399624228477, "learning_rate": 1.1552753180520612e-05, "loss": 0.0526, "reward": 1.0494791924953462, "reward_std": 0.06868757046759129, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291865348816, "step": 1579 }, { "clip_ratio": 0.0, "completion_length": 602.3291870117188, "epoch": 0.5056809089454313, "grad_norm": 0.07465245574712753, "kl": 0.2693470485508442, "learning_rate": 1.15417116972149e-05, "loss": 0.057, "reward": 1.0104166924953462, "reward_std": 0.10453488267958164, "rewards/accuracy_reward": 0.025000001303851604, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166805744171, "step": 1580 }, { "clip_ratio": 0.0, "completion_length": 611.9937622070313, "epoch": 0.5060009601536246, "grad_norm": 0.12690547108650208, "kl": 0.36593677997589114, "learning_rate": 1.1530668288242244e-05, "loss": 0.1098, "reward": 1.122916692495346, "reward_std": 0.15517406426370145, "rewards/accuracy_reward": 0.1583333384245634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833492279052, "step": 1581 }, { "clip_ratio": 0.0, "completion_length": 600.5250183105469, "epoch": 0.5063210113618178, "grad_norm": 0.08542995899915695, "kl": 0.30837327912449836, "learning_rate": 1.1519622967396347e-05, "loss": 0.0626, "reward": 1.0864583671092987, "reward_std": 0.1340044129639864, "rewards/accuracy_reward": 0.11041667181998491, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1582 }, { "clip_ratio": 0.0, "completion_length": 606.783349609375, "epoch": 0.5066410625700112, "grad_norm": 0.12930899858474731, "kl": 0.20259029194712638, "learning_rate": 1.1508575748473317e-05, "loss": 0.07, "reward": 1.027604192495346, "reward_std": 0.10508870705962181, "rewards/accuracy_reward": 0.0479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9796875178813934, "step": 1583 }, { "clip_ratio": 0.0, "completion_length": 581.8916870117188, "epoch": 0.5069611137782045, "grad_norm": 0.12330963462591171, "kl": 0.2539986282587051, "learning_rate": 1.1497526645271618e-05, "loss": 0.0491, "reward": 1.0385416924953461, "reward_std": 0.09695471059530973, "rewards/accuracy_reward": 0.0541666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750178813935, "step": 1584 }, { "clip_ratio": 0.0, "completion_length": 601.6916809082031, "epoch": 0.5072811649863979, "grad_norm": 0.09102137386798859, "kl": 0.23063104897737502, "learning_rate": 1.1486475671592084e-05, "loss": 0.0721, "reward": 1.0562500298023223, "reward_std": 0.12390441037714481, "rewards/accuracy_reward": 0.07916666846722364, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833551883698, "step": 1585 }, { "clip_ratio": 0.0, "completion_length": 580.4791870117188, "epoch": 0.5076012161945911, "grad_norm": 0.10432296246290207, "kl": 0.30286980494856836, "learning_rate": 1.1475422841237867e-05, "loss": 0.0615, "reward": 1.0531250298023225, "reward_std": 0.10416628401726484, "rewards/accuracy_reward": 0.07291666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083551883698, "step": 1586 }, { "clip_ratio": 0.0, "completion_length": 549.1687744140625, "epoch": 0.5079212674027844, "grad_norm": 0.07721851021051407, "kl": 0.2693323642015457, "learning_rate": 1.146436816801445e-05, "loss": 0.0738, "reward": 1.0328125298023223, "reward_std": 0.13880394026637077, "rewards/accuracy_reward": 0.05208333563059568, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9786458492279053, "step": 1587 }, { "clip_ratio": 0.0, "completion_length": 594.4062683105469, "epoch": 0.5082413186109778, "grad_norm": 0.1546679586172104, "kl": 0.18442733883857726, "learning_rate": 1.1453311665729618e-05, "loss": 0.0621, "reward": 1.068229180574417, "reward_std": 0.0874556915834546, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982812511920929, "step": 1588 }, { "clip_ratio": 0.0, "completion_length": 581.8958557128906, "epoch": 0.5085613698191711, "grad_norm": 0.11382945626974106, "kl": 0.1633252240717411, "learning_rate": 1.1442253348193437e-05, "loss": 0.063, "reward": 1.1130208551883698, "reward_std": 0.10782817732542753, "rewards/accuracy_reward": 0.1270833358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 1589 }, { "clip_ratio": 0.0, "completion_length": 595.0937652587891, "epoch": 0.5088814210273643, "grad_norm": 0.10278374701738358, "kl": 0.18282031267881393, "learning_rate": 1.1431193229218236e-05, "loss": 0.0389, "reward": 1.1364583492279052, "reward_std": 0.10143474787473679, "rewards/accuracy_reward": 0.1541666707023978, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916746139526, "step": 1590 }, { "clip_ratio": 0.0, "completion_length": 586.0812683105469, "epoch": 0.5092014722355577, "grad_norm": 0.13747954368591309, "kl": 0.21457262933254242, "learning_rate": 1.1420131322618601e-05, "loss": 0.0535, "reward": 1.0312500298023224, "reward_std": 0.09525865130126476, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9812500238418579, "step": 1591 }, { "clip_ratio": 0.0, "completion_length": 574.743765258789, "epoch": 0.509521523443751, "grad_norm": 0.06254348903894424, "kl": 0.1685093741863966, "learning_rate": 1.1409067642211352e-05, "loss": 0.0469, "reward": 1.0687500178813933, "reward_std": 0.10046824738383293, "rewards/accuracy_reward": 0.08333333544433116, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166746139527, "step": 1592 }, { "clip_ratio": 0.0, "completion_length": 602.8271057128907, "epoch": 0.5098415746519444, "grad_norm": 0.10820659250020981, "kl": 0.20122895240783692, "learning_rate": 1.1398002201815517e-05, "loss": 0.0486, "reward": 1.007812511920929, "reward_std": 0.0893495699390769, "rewards/accuracy_reward": 0.02291666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958432674408, "step": 1593 }, { "clip_ratio": 0.0, "completion_length": 574.4104309082031, "epoch": 0.5101616258601376, "grad_norm": 0.1397324651479721, "kl": 0.22246124893426894, "learning_rate": 1.138693501525233e-05, "loss": 0.0574, "reward": 1.1260416865348817, "reward_std": 0.14539001807570456, "rewards/accuracy_reward": 0.14166667126119137, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1594 }, { "clip_ratio": 0.0, "completion_length": 596.7250183105468, "epoch": 0.5104816770683309, "grad_norm": 0.08888793736696243, "kl": 0.25225738137960435, "learning_rate": 1.1375866096345201e-05, "loss": 0.0785, "reward": 1.0520833671092986, "reward_std": 0.11396115198731423, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 1595 }, { "clip_ratio": 0.0, "completion_length": 561.0479309082032, "epoch": 0.5108017282765243, "grad_norm": 0.10985810309648514, "kl": 0.17879950627684593, "learning_rate": 1.1364795458919704e-05, "loss": 0.0433, "reward": 1.083854180574417, "reward_std": 0.0588629137724638, "rewards/accuracy_reward": 0.09375000502914191, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9901041746139526, "step": 1596 }, { "clip_ratio": 0.0, "completion_length": 586.3020935058594, "epoch": 0.5111217794847176, "grad_norm": 0.09722897410392761, "kl": 0.23140017688274384, "learning_rate": 1.135372311680356e-05, "loss": 0.0473, "reward": 1.1265625298023223, "reward_std": 0.13466337826102973, "rewards/accuracy_reward": 0.1395833369344473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9869791805744171, "step": 1597 }, { "clip_ratio": 0.0, "completion_length": 610.4125122070312, "epoch": 0.5114418306929108, "grad_norm": 0.06069159880280495, "kl": 0.19542404487729073, "learning_rate": 1.1342649083826629e-05, "loss": 0.0555, "reward": 1.0776041805744172, "reward_std": 0.08706863857805729, "rewards/accuracy_reward": 0.09583333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708432674408, "step": 1598 }, { "clip_ratio": 0.0, "completion_length": 576.1021026611328, "epoch": 0.5117618819011042, "grad_norm": 0.04462220519781113, "kl": 0.16532036811113357, "learning_rate": 1.1331573373820864e-05, "loss": 0.07, "reward": 1.0604166865348816, "reward_std": 0.08268660437315703, "rewards/accuracy_reward": 0.07708333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333432674408, "step": 1599 }, { "clip_ratio": 0.0, "completion_length": 611.520849609375, "epoch": 0.5120819331092975, "grad_norm": 0.16866251826286316, "kl": 0.19539010524749756, "learning_rate": 1.1320496000620325e-05, "loss": 0.0499, "reward": 1.0390625417232513, "reward_std": 0.11077856961637736, "rewards/accuracy_reward": 0.05416666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958432674408, "step": 1600 }, { "clip_ratio": 0.0, "completion_length": 571.6520965576171, "epoch": 0.5124019843174908, "grad_norm": 0.07243788987398148, "kl": 0.16208918057382107, "learning_rate": 1.1309416978061149e-05, "loss": 0.0587, "reward": 1.0708333492279052, "reward_std": 0.10626544300466775, "rewards/accuracy_reward": 0.08541666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166805744171, "step": 1601 }, { "clip_ratio": 0.0, "completion_length": 577.2062683105469, "epoch": 0.5127220355256841, "grad_norm": 0.060984719544649124, "kl": 0.22590798139572144, "learning_rate": 1.1298336319981532e-05, "loss": 0.0347, "reward": 1.0562500178813934, "reward_std": 0.06318792756646871, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166865348816, "step": 1602 }, { "clip_ratio": 0.0, "completion_length": 600.9771057128906, "epoch": 0.5130420867338774, "grad_norm": 0.05511503294110298, "kl": 0.13532592430710794, "learning_rate": 1.128725404022171e-05, "loss": 0.027, "reward": 1.0979166984558106, "reward_std": 0.14514823630452156, "rewards/accuracy_reward": 0.10833333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9895833432674408, "step": 1603 }, { "clip_ratio": 0.0, "completion_length": 591.5083526611328, "epoch": 0.5133621379420708, "grad_norm": 0.06461673974990845, "kl": 0.1610957682132721, "learning_rate": 1.1276170152623948e-05, "loss": 0.045, "reward": 1.0687500357627868, "reward_std": 0.07482259795069694, "rewards/accuracy_reward": 0.07916666902601718, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9895833432674408, "step": 1604 }, { "clip_ratio": 0.0, "completion_length": 606.4104370117187, "epoch": 0.5136821891502641, "grad_norm": 0.05804125964641571, "kl": 0.14314365684986113, "learning_rate": 1.1265084671032516e-05, "loss": 0.0558, "reward": 1.1213541984558106, "reward_std": 0.11102346312254667, "rewards/accuracy_reward": 0.13333333786576987, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9880208492279052, "step": 1605 }, { "clip_ratio": 0.0, "completion_length": 584.770849609375, "epoch": 0.5140022403584573, "grad_norm": 0.045790914446115494, "kl": 0.17196787595748902, "learning_rate": 1.1253997609293684e-05, "loss": 0.0459, "reward": 1.0552083671092987, "reward_std": 0.16030770651996135, "rewards/accuracy_reward": 0.07291666995733977, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916865348816, "step": 1606 }, { "clip_ratio": 0.0, "completion_length": 634.3562683105469, "epoch": 0.5143222915666507, "grad_norm": 0.03354915976524353, "kl": 0.10071540996432304, "learning_rate": 1.1242908981255676e-05, "loss": 0.0336, "reward": 1.0520833611488343, "reward_std": 0.06626205574721097, "rewards/accuracy_reward": 0.06250000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9895833492279053, "step": 1607 }, { "clip_ratio": 0.0, "completion_length": 600.9458526611328, "epoch": 0.514642342774844, "grad_norm": 0.06027187034487724, "kl": 0.1353273831307888, "learning_rate": 1.1231818800768696e-05, "loss": 0.0616, "reward": 1.064062511920929, "reward_std": 0.08453094661235809, "rewards/accuracy_reward": 0.08125000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982812511920929, "step": 1608 }, { "clip_ratio": 0.0, "completion_length": 602.9979309082031, "epoch": 0.5149623939830373, "grad_norm": 0.08312620967626572, "kl": 0.10024843215942383, "learning_rate": 1.122072708168487e-05, "loss": 0.0348, "reward": 1.0635416865348817, "reward_std": 0.05407893825322389, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9906250178813935, "step": 1609 }, { "clip_ratio": 0.0, "completion_length": 617.764599609375, "epoch": 0.5152824451912306, "grad_norm": 0.13844886422157288, "kl": 0.30158936940133574, "learning_rate": 1.1209633837858256e-05, "loss": 0.0307, "reward": 1.043750023841858, "reward_std": 0.08770579267293215, "rewards/accuracy_reward": 0.06458333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666805744171, "step": 1610 }, { "clip_ratio": 0.0, "completion_length": 599.081265258789, "epoch": 0.5156024963994239, "grad_norm": 0.08852372318506241, "kl": 0.14461980909109115, "learning_rate": 1.1198539083144808e-05, "loss": 0.0344, "reward": 1.0302083671092988, "reward_std": 0.11091033667325974, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250178813934, "step": 1611 }, { "clip_ratio": 0.0, "completion_length": 583.8437744140625, "epoch": 0.5159225476076172, "grad_norm": 0.05620553344488144, "kl": 0.1273003876209259, "learning_rate": 1.1187442831402378e-05, "loss": 0.0337, "reward": 1.028125035762787, "reward_std": 0.11089132819324732, "rewards/accuracy_reward": 0.043750000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750178813935, "step": 1612 }, { "clip_ratio": 0.0, "completion_length": 624.5875244140625, "epoch": 0.5162425988158106, "grad_norm": 0.07525043934583664, "kl": 0.13220786526799203, "learning_rate": 1.1176345096490671e-05, "loss": 0.0326, "reward": 1.1453125298023223, "reward_std": 0.12826823052018882, "rewards/accuracy_reward": 0.16666667368263005, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1613 }, { "clip_ratio": 0.0, "completion_length": 603.2125244140625, "epoch": 0.5165626500240038, "grad_norm": 0.06219992786645889, "kl": 0.2638593137264252, "learning_rate": 1.1165245892271265e-05, "loss": 0.0666, "reward": 1.1098958551883698, "reward_std": 0.11750640124082565, "rewards/accuracy_reward": 0.13125000409781934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1614 }, { "clip_ratio": 0.0, "completion_length": 608.9750122070312, "epoch": 0.5168827012321972, "grad_norm": 0.1037401407957077, "kl": 0.49187539964914323, "learning_rate": 1.1154145232607558e-05, "loss": 0.0055, "reward": 1.1229166865348816, "reward_std": 0.10175382327288389, "rewards/accuracy_reward": 0.13958333674818277, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333432674408, "step": 1615 }, { "clip_ratio": 0.0, "completion_length": 606.4583526611328, "epoch": 0.5172027524403905, "grad_norm": 0.09893354773521423, "kl": 0.33075669668614865, "learning_rate": 1.114304313136477e-05, "loss": 0.03, "reward": 1.0557292103767395, "reward_std": 0.10697339177131653, "rewards/accuracy_reward": 0.07291666995733977, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9828125178813935, "step": 1616 }, { "clip_ratio": 0.0, "completion_length": 608.6666839599609, "epoch": 0.5175228036485837, "grad_norm": 0.10171794891357422, "kl": 0.172719369456172, "learning_rate": 1.1131939602409926e-05, "loss": 0.0272, "reward": 1.1250000298023224, "reward_std": 0.11128932349383831, "rewards/accuracy_reward": 0.13958333544433116, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166865348816, "step": 1617 }, { "clip_ratio": 0.0, "completion_length": 640.4916900634765, "epoch": 0.5178428548567771, "grad_norm": 0.08092484623193741, "kl": 0.38851368948817255, "learning_rate": 1.1120834659611832e-05, "loss": 0.0293, "reward": 1.115104192495346, "reward_std": 0.079699626006186, "rewards/accuracy_reward": 0.13125000409781934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541805744171, "step": 1618 }, { "clip_ratio": 0.0, "completion_length": 586.6375122070312, "epoch": 0.5181629060649704, "grad_norm": 0.13454897701740265, "kl": 0.2053068086504936, "learning_rate": 1.1109728316841056e-05, "loss": 0.0206, "reward": 1.025000011920929, "reward_std": 0.08904234617948532, "rewards/accuracy_reward": 0.0375000013038516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987500011920929, "step": 1619 }, { "clip_ratio": 0.0, "completion_length": 603.0979400634766, "epoch": 0.5184829572731637, "grad_norm": 0.040230825543403625, "kl": 0.10919744968414306, "learning_rate": 1.1098620587969915e-05, "loss": 0.0143, "reward": 1.082812523841858, "reward_std": 0.05984876081347466, "rewards/accuracy_reward": 0.08750000260770321, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9932291805744171, "step": 1620 }, { "clip_ratio": 0.0, "completion_length": 629.633349609375, "epoch": 0.518803008481357, "grad_norm": 0.05787614732980728, "kl": 0.2819319121539593, "learning_rate": 1.1087511486872461e-05, "loss": 0.0489, "reward": 1.0286458611488343, "reward_std": 0.12565587386488913, "rewards/accuracy_reward": 0.04791666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291746139526, "step": 1621 }, { "clip_ratio": 0.0, "completion_length": 631.9333618164062, "epoch": 0.5191230596895503, "grad_norm": 0.09872201085090637, "kl": 0.14482049122452736, "learning_rate": 1.1076401027424464e-05, "loss": 0.0377, "reward": 1.032291704416275, "reward_std": 0.11295400913804769, "rewards/accuracy_reward": 0.04375000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9885416805744172, "step": 1622 }, { "clip_ratio": 0.0, "completion_length": 598.5416870117188, "epoch": 0.5194431108977436, "grad_norm": 0.05958201363682747, "kl": 0.17514662258327007, "learning_rate": 1.106528922350338e-05, "loss": 0.0322, "reward": 1.0880208551883697, "reward_std": 0.061942750960588454, "rewards/accuracy_reward": 0.10000000596046447, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9880208432674408, "step": 1623 }, { "clip_ratio": 0.0, "completion_length": 614.2687713623047, "epoch": 0.519763162105937, "grad_norm": 0.16206012666225433, "kl": 0.14412535950541497, "learning_rate": 1.1054176088988352e-05, "loss": 0.0338, "reward": 1.0739583551883698, "reward_std": 0.08700152412056923, "rewards/accuracy_reward": 0.08958333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1624 }, { "clip_ratio": 0.0, "completion_length": 589.5583526611329, "epoch": 0.5200832133141302, "grad_norm": 0.06881947815418243, "kl": 0.1701977513730526, "learning_rate": 1.1043061637760184e-05, "loss": 0.0415, "reward": 1.090104192495346, "reward_std": 0.13068218622356653, "rewards/accuracy_reward": 0.10833333767950534, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708551883697, "step": 1625 }, { "clip_ratio": 0.0, "completion_length": 587.1791870117188, "epoch": 0.5204032645223235, "grad_norm": 0.09448045492172241, "kl": 0.2662590444087982, "learning_rate": 1.1031945883701319e-05, "loss": 0.0261, "reward": 1.0036458551883698, "reward_std": 0.11030229702591896, "rewards/accuracy_reward": 0.01875000037252903, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.982812511920929, "step": 1626 }, { "clip_ratio": 0.0, "completion_length": 642.1333557128906, "epoch": 0.5207233157305169, "grad_norm": 0.2936466634273529, "kl": 0.3425960190594196, "learning_rate": 1.1020828840695836e-05, "loss": 0.059, "reward": 1.071354192495346, "reward_std": 0.11369431018829346, "rewards/accuracy_reward": 0.09375000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 1627 }, { "clip_ratio": 0.0, "completion_length": 639.9271057128906, "epoch": 0.5210433669387102, "grad_norm": 0.12267967313528061, "kl": 0.16440969109535217, "learning_rate": 1.1009710522629415e-05, "loss": 0.0323, "reward": 1.0927083492279053, "reward_std": 0.11118660233914852, "rewards/accuracy_reward": 0.10416666846722364, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9864583432674408, "step": 1628 }, { "clip_ratio": 0.0, "completion_length": 638.9687744140625, "epoch": 0.5213634181469035, "grad_norm": 0.07434359192848206, "kl": 0.13105014562606812, "learning_rate": 1.099859094338934e-05, "loss": 0.0258, "reward": 1.0406250417232514, "reward_std": 0.11527959946542979, "rewards/accuracy_reward": 0.05000000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9906250059604644, "step": 1629 }, { "clip_ratio": 0.0, "completion_length": 593.7312683105469, "epoch": 0.5216834693550968, "grad_norm": 0.06500350683927536, "kl": 0.12343095205724239, "learning_rate": 1.0987470116864454e-05, "loss": 0.0325, "reward": 1.1812500298023223, "reward_std": 0.0721820143982768, "rewards/accuracy_reward": 0.18958333935588598, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916666746139526, "step": 1630 }, { "clip_ratio": 0.0, "completion_length": 604.7062683105469, "epoch": 0.5220035205632901, "grad_norm": 0.057139065116643906, "kl": 0.28337795436382296, "learning_rate": 1.0976348056945176e-05, "loss": 0.0548, "reward": 1.0281250298023223, "reward_std": 0.11384273990988732, "rewards/accuracy_reward": 0.05208333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416865348815, "step": 1631 }, { "clip_ratio": 0.0, "completion_length": 608.2166870117187, "epoch": 0.5223235717714835, "grad_norm": 0.05733027681708336, "kl": 0.19319367222487926, "learning_rate": 1.096522477752345e-05, "loss": 0.0497, "reward": 1.0739583492279052, "reward_std": 0.1705170389264822, "rewards/accuracy_reward": 0.08958333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750178813935, "step": 1632 }, { "clip_ratio": 0.0, "completion_length": 587.8000244140625, "epoch": 0.5226436229796767, "grad_norm": 0.06594210118055344, "kl": 0.1955754801630974, "learning_rate": 1.0954100292492758e-05, "loss": 0.0696, "reward": 1.0895833611488341, "reward_std": 0.16243830658495426, "rewards/accuracy_reward": 0.10833333749324084, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9812500178813934, "step": 1633 }, { "clip_ratio": 0.0, "completion_length": 608.0229431152344, "epoch": 0.52296367418787, "grad_norm": 0.10943306237459183, "kl": 0.1569736622273922, "learning_rate": 1.0942974615748069e-05, "loss": 0.0473, "reward": 1.0062500178813933, "reward_std": 0.08850672990083694, "rewards/accuracy_reward": 0.020833334513008596, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166805744171, "step": 1634 }, { "clip_ratio": 0.0, "completion_length": 628.8208618164062, "epoch": 0.5232837253960634, "grad_norm": 0.06816142052412033, "kl": 0.20981598347425462, "learning_rate": 1.0931847761185863e-05, "loss": 0.0467, "reward": 1.0416666924953462, "reward_std": 0.13459960371255875, "rewards/accuracy_reward": 0.06041666902601719, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9791666805744171, "step": 1635 }, { "clip_ratio": 0.0, "completion_length": 653.4812744140625, "epoch": 0.5236037766042567, "grad_norm": 0.08721835911273956, "kl": 0.2609092280268669, "learning_rate": 1.0920719742704071e-05, "loss": 0.027, "reward": 1.1067708551883697, "reward_std": 0.1595559787005186, "rewards/accuracy_reward": 0.13125000409781934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208551883697, "step": 1636 }, { "clip_ratio": 0.0, "completion_length": 589.3937683105469, "epoch": 0.52392382781245, "grad_norm": 0.09751680493354797, "kl": 0.16274499967694284, "learning_rate": 1.0909590574202094e-05, "loss": 0.0467, "reward": 1.0932291805744172, "reward_std": 0.15058468338102102, "rewards/accuracy_reward": 0.10625000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9869791805744171, "step": 1637 }, { "clip_ratio": 0.0, "completion_length": 623.6104309082032, "epoch": 0.5242438790206433, "grad_norm": 0.05089619755744934, "kl": 0.1571653764694929, "learning_rate": 1.0898460269580753e-05, "loss": 0.025, "reward": 1.060937523841858, "reward_std": 0.11460729204118252, "rewards/accuracy_reward": 0.07083333488553763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9901041746139526, "step": 1638 }, { "clip_ratio": 0.0, "completion_length": 621.3625213623047, "epoch": 0.5245639302288366, "grad_norm": 0.05131271854043007, "kl": 0.19617617279291152, "learning_rate": 1.0887328842742307e-05, "loss": 0.061, "reward": 1.0338541924953462, "reward_std": 0.10966868083924056, "rewards/accuracy_reward": 0.054166667722165586, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9796875178813934, "step": 1639 }, { "clip_ratio": 0.0, "completion_length": 617.6271057128906, "epoch": 0.52488398143703, "grad_norm": 0.11150997877120972, "kl": 0.29234243743121624, "learning_rate": 1.0876196307590396e-05, "loss": 0.0396, "reward": 1.129687523841858, "reward_std": 0.16783834397792816, "rewards/accuracy_reward": 0.1479166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708432674408, "step": 1640 }, { "clip_ratio": 0.0, "completion_length": 579.3250183105469, "epoch": 0.5252040326452232, "grad_norm": 0.07527544349431992, "kl": 0.12925414890050888, "learning_rate": 1.0865062678030065e-05, "loss": 0.0348, "reward": 1.1135416984558106, "reward_std": 0.12643732279539108, "rewards/accuracy_reward": 0.12500000316649676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9885416865348816, "step": 1641 }, { "clip_ratio": 0.0, "completion_length": 630.1187683105469, "epoch": 0.5255240838534165, "grad_norm": 0.11255325376987457, "kl": 0.1748013935983181, "learning_rate": 1.0853927967967705e-05, "loss": 0.0423, "reward": 1.0729166984558105, "reward_std": 0.1471638258546591, "rewards/accuracy_reward": 0.09166666958481073, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9791666865348816, "step": 1642 }, { "clip_ratio": 0.0, "completion_length": 635.6479370117188, "epoch": 0.5258441350616099, "grad_norm": 0.13885398209095, "kl": 0.19732209667563438, "learning_rate": 1.0842792191311079e-05, "loss": 0.028, "reward": 1.0838541984558105, "reward_std": 0.10696388110518455, "rewards/accuracy_reward": 0.09583333767950535, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9880208432674408, "step": 1643 }, { "clip_ratio": 0.0, "completion_length": 615.5771057128907, "epoch": 0.5261641862698032, "grad_norm": 0.17584773898124695, "kl": 0.21706494837999343, "learning_rate": 1.0831655361969263e-05, "loss": 0.0298, "reward": 1.0614583492279053, "reward_std": 0.05746750514954328, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583432674408, "step": 1644 }, { "clip_ratio": 0.0, "completion_length": 614.0312622070312, "epoch": 0.5264842374779964, "grad_norm": 0.09475237876176834, "kl": 0.2863244879990816, "learning_rate": 1.0820517493852655e-05, "loss": 0.0454, "reward": 1.1447917103767395, "reward_std": 0.10612938590347767, "rewards/accuracy_reward": 0.16041667126119136, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750178813935, "step": 1645 }, { "clip_ratio": 0.0, "completion_length": 617.1062744140625, "epoch": 0.5268042886861898, "grad_norm": 0.07149934023618698, "kl": 0.09891332313418388, "learning_rate": 1.0809378600872957e-05, "loss": 0.0438, "reward": 1.0531250357627868, "reward_std": 0.06739842891693115, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750298023224, "step": 1646 }, { "clip_ratio": 0.0, "completion_length": 630.9021057128906, "epoch": 0.5271243398943831, "grad_norm": 0.046160925179719925, "kl": 0.23046648167073727, "learning_rate": 1.0798238696943144e-05, "loss": 0.0443, "reward": 1.0125000119209289, "reward_std": 0.10311015080660582, "rewards/accuracy_reward": 0.03125000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981250011920929, "step": 1647 }, { "clip_ratio": 0.0, "completion_length": 644.7291931152344, "epoch": 0.5274443911025765, "grad_norm": 0.1453281044960022, "kl": 0.19713319800794124, "learning_rate": 1.0787097795977447e-05, "loss": 0.0627, "reward": 1.0276041746139526, "reward_std": 0.12270144894719123, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541746139527, "step": 1648 }, { "clip_ratio": 0.0, "completion_length": 614.435433959961, "epoch": 0.5277644423107697, "grad_norm": 0.09991775453090668, "kl": 0.25720389261841775, "learning_rate": 1.077595591189136e-05, "loss": 0.0617, "reward": 1.0979166984558106, "reward_std": 0.11305834613740444, "rewards/accuracy_reward": 0.11666666995733976, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981250011920929, "step": 1649 }, { "clip_ratio": 0.0, "completion_length": 678.664599609375, "epoch": 0.528084493518963, "grad_norm": 0.049913953989744186, "kl": 0.16697454005479812, "learning_rate": 1.0764813058601591e-05, "loss": 0.0386, "reward": 1.0432291984558106, "reward_std": 0.1091542337089777, "rewards/accuracy_reward": 0.05833333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958492279053, "step": 1650 }, { "clip_ratio": 0.0, "completion_length": 629.8896087646484, "epoch": 0.5284045447271564, "grad_norm": 0.07997886091470718, "kl": 0.245851968228817, "learning_rate": 1.0753669250026062e-05, "loss": 0.0377, "reward": 1.109375035762787, "reward_std": 0.1784997694194317, "rewards/accuracy_reward": 0.1312500026077032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250178813934, "step": 1651 }, { "clip_ratio": 0.0, "completion_length": 592.6104370117188, "epoch": 0.5287245959353497, "grad_norm": 0.07213015109300613, "kl": 0.15853434652090073, "learning_rate": 1.0742524500083891e-05, "loss": 0.0473, "reward": 1.0869792103767395, "reward_std": 0.14299752824008466, "rewards/accuracy_reward": 0.10208333861082793, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958432674408, "step": 1652 }, { "clip_ratio": 0.0, "completion_length": 620.9937713623046, "epoch": 0.5290446471435429, "grad_norm": 0.1067395806312561, "kl": 0.22370605766773224, "learning_rate": 1.0731378822695368e-05, "loss": 0.0714, "reward": 1.1234375298023225, "reward_std": 0.10482490509748459, "rewards/accuracy_reward": 0.1458333373069763, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9755208492279053, "step": 1653 }, { "clip_ratio": 0.0, "completion_length": 640.8729370117187, "epoch": 0.5293646983517363, "grad_norm": 0.23465663194656372, "kl": 0.2824744485318661, "learning_rate": 1.0720232231781944e-05, "loss": 0.0604, "reward": 1.0869791865348817, "reward_std": 0.09580317065119744, "rewards/accuracy_reward": 0.11041666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625119209289, "step": 1654 }, { "clip_ratio": 0.0, "completion_length": 637.6833557128906, "epoch": 0.5296847495599296, "grad_norm": 0.09326420724391937, "kl": 0.2504761453717947, "learning_rate": 1.070908474126621e-05, "loss": 0.055, "reward": 1.1098958730697632, "reward_std": 0.13805349618196489, "rewards/accuracy_reward": 0.1312500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1655 }, { "clip_ratio": 0.0, "completion_length": 605.3979339599609, "epoch": 0.530004800768123, "grad_norm": 0.09102252870798111, "kl": 0.20697028711438178, "learning_rate": 1.069793636507188e-05, "loss": 0.054, "reward": 1.0390625178813935, "reward_std": 0.09756367336958646, "rewards/accuracy_reward": 0.0583333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9807291805744172, "step": 1656 }, { "clip_ratio": 0.0, "completion_length": 627.1354370117188, "epoch": 0.5303248519763162, "grad_norm": 0.09323302656412125, "kl": 0.20649093016982079, "learning_rate": 1.0686787117123776e-05, "loss": 0.0613, "reward": 1.0656250059604644, "reward_std": 0.11614571772515773, "rewards/accuracy_reward": 0.08750000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250119209289, "step": 1657 }, { "clip_ratio": 0.0, "completion_length": 650.2395935058594, "epoch": 0.5306449031845095, "grad_norm": 0.15243440866470337, "kl": 0.23995474874973297, "learning_rate": 1.067563701134781e-05, "loss": 0.0722, "reward": 1.1114583551883697, "reward_std": 0.14056676384061575, "rewards/accuracy_reward": 0.14166667088866233, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916746139527, "step": 1658 }, { "clip_ratio": 0.0, "completion_length": 614.4375305175781, "epoch": 0.5309649543927029, "grad_norm": 0.07021994888782501, "kl": 0.1741291381418705, "learning_rate": 1.0664486061670957e-05, "loss": 0.0343, "reward": 1.0968750357627868, "reward_std": 0.1501768246293068, "rewards/accuracy_reward": 0.11666667386889458, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083492279052, "step": 1659 }, { "clip_ratio": 0.0, "completion_length": 600.739599609375, "epoch": 0.5312850056008962, "grad_norm": 0.11777395009994507, "kl": 0.4072952277958393, "learning_rate": 1.0653334282021261e-05, "loss": 0.0914, "reward": 1.1380208611488343, "reward_std": 0.18841511830687524, "rewards/accuracy_reward": 0.17291667219251394, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041865348816, "step": 1660 }, { "clip_ratio": 0.0, "completion_length": 642.8521057128906, "epoch": 0.5316050568090894, "grad_norm": 0.09537654370069504, "kl": 0.15284304693341255, "learning_rate": 1.0642181686327788e-05, "loss": 0.0651, "reward": 1.052604180574417, "reward_std": 0.12506925128400326, "rewards/accuracy_reward": 0.07083333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708492279053, "step": 1661 }, { "clip_ratio": 0.0, "completion_length": 611.5312683105469, "epoch": 0.5319251080172828, "grad_norm": 0.15618105232715607, "kl": 0.3179732132703066, "learning_rate": 1.0631028288520634e-05, "loss": 0.0555, "reward": 1.0354166805744172, "reward_std": 0.11091351509094238, "rewards/accuracy_reward": 0.05625000018626451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666805744171, "step": 1662 }, { "clip_ratio": 0.0, "completion_length": 621.3104370117187, "epoch": 0.5322451592254761, "grad_norm": 0.2050512731075287, "kl": 0.3762538552284241, "learning_rate": 1.0619874102530886e-05, "loss": 0.1004, "reward": 1.0651041924953462, "reward_std": 0.18217533379793166, "rewards/accuracy_reward": 0.10000000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9651041805744172, "step": 1663 }, { "clip_ratio": 0.0, "completion_length": 611.4875244140625, "epoch": 0.5325652104336693, "grad_norm": 0.10614674538373947, "kl": 0.1920756734907627, "learning_rate": 1.0608719142290626e-05, "loss": 0.052, "reward": 1.0640625298023223, "reward_std": 0.12579189017415046, "rewards/accuracy_reward": 0.08125000223517417, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9807291865348816, "step": 1664 }, { "clip_ratio": 0.0, "completion_length": 645.9229370117188, "epoch": 0.5328852616418627, "grad_norm": 0.11540885269641876, "kl": 0.31467584148049355, "learning_rate": 1.0597563421732899e-05, "loss": 0.0566, "reward": 1.026041680574417, "reward_std": 0.12655243016779422, "rewards/accuracy_reward": 0.05625000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916865348816, "step": 1665 }, { "clip_ratio": 0.0, "completion_length": 609.0500183105469, "epoch": 0.533205312850056, "grad_norm": 0.24006302654743195, "kl": 0.22442513927817345, "learning_rate": 1.0586406954791702e-05, "loss": 0.0654, "reward": 1.152604204416275, "reward_std": 0.13982443250715731, "rewards/accuracy_reward": 0.17708333861082792, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208551883697, "step": 1666 }, { "clip_ratio": 0.0, "completion_length": 601.9833587646484, "epoch": 0.5335253640582494, "grad_norm": 0.24842822551727295, "kl": 0.45479761958122256, "learning_rate": 1.0575249755401952e-05, "loss": 0.0873, "reward": 1.1083333611488342, "reward_std": 0.16125447899103165, "rewards/accuracy_reward": 0.1354166718199849, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9708333551883698, "step": 1667 }, { "clip_ratio": 0.0, "completion_length": 608.7250183105468, "epoch": 0.5338454152664426, "grad_norm": 0.14025604724884033, "kl": 0.47216527312994006, "learning_rate": 1.0564091837499503e-05, "loss": 0.0835, "reward": 1.0875000178813934, "reward_std": 0.1817237138748169, "rewards/accuracy_reward": 0.12291666939854622, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9625000059604645, "step": 1668 }, { "clip_ratio": 0.0, "completion_length": 640.3666931152344, "epoch": 0.5341654664746359, "grad_norm": 0.2198648303747177, "kl": 0.239653842151165, "learning_rate": 1.0552933215021088e-05, "loss": 0.0429, "reward": 1.0875000178813934, "reward_std": 0.11936910003423691, "rewards/accuracy_reward": 0.1104166716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833432674408, "step": 1669 }, { "clip_ratio": 0.0, "completion_length": 615.527099609375, "epoch": 0.5344855176828293, "grad_norm": 0.1614704430103302, "kl": 0.3589115433394909, "learning_rate": 1.0541773901904327e-05, "loss": 0.1041, "reward": 1.105729192495346, "reward_std": 0.1668396320194006, "rewards/accuracy_reward": 0.1375000050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291865348815, "step": 1670 }, { "clip_ratio": 0.0, "completion_length": 632.5521057128906, "epoch": 0.5348055688910226, "grad_norm": 0.30173394083976746, "kl": 0.4323131963610649, "learning_rate": 1.0530613912087698e-05, "loss": 0.0656, "reward": 1.0489583671092988, "reward_std": 0.10187356732785702, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416865348815, "step": 1671 }, { "clip_ratio": 0.0, "completion_length": 585.1250183105469, "epoch": 0.5351256200992158, "grad_norm": 0.349128395318985, "kl": 0.334938682615757, "learning_rate": 1.0519453259510535e-05, "loss": 0.0533, "reward": 1.0223958611488342, "reward_std": 0.14337569773197173, "rewards/accuracy_reward": 0.05000000242143869, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9703125178813934, "step": 1672 }, { "clip_ratio": 0.0, "completion_length": 651.4416931152343, "epoch": 0.5354456713074092, "grad_norm": 0.20258475840091705, "kl": 0.4476259782910347, "learning_rate": 1.0508291958112988e-05, "loss": 0.0911, "reward": 1.0401041984558106, "reward_std": 0.15322203636169435, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.967187511920929, "step": 1673 }, { "clip_ratio": 0.0, "completion_length": 616.427099609375, "epoch": 0.5357657225156025, "grad_norm": 0.1804238259792328, "kl": 0.31707819551229477, "learning_rate": 1.0497130021836023e-05, "loss": 0.0828, "reward": 1.1041666984558105, "reward_std": 0.17293839827179908, "rewards/accuracy_reward": 0.12708333879709244, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833551883698, "step": 1674 }, { "clip_ratio": 0.0, "completion_length": 635.5416931152344, "epoch": 0.5360857737237958, "grad_norm": 0.23864604532718658, "kl": 0.4047357439994812, "learning_rate": 1.0485967464621401e-05, "loss": 0.1048, "reward": 1.068229192495346, "reward_std": 0.16022577956318856, "rewards/accuracy_reward": 0.1020833369344473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458432674408, "step": 1675 }, { "clip_ratio": 0.0, "completion_length": 623.5458435058594, "epoch": 0.5364058249319891, "grad_norm": 0.20877757668495178, "kl": 0.3747469946742058, "learning_rate": 1.0474804300411652e-05, "loss": 0.0864, "reward": 1.0338541924953462, "reward_std": 0.16450630873441696, "rewards/accuracy_reward": 0.0666666692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875178813935, "step": 1676 }, { "clip_ratio": 0.0, "completion_length": 603.9208465576172, "epoch": 0.5367258761401824, "grad_norm": 0.4435693323612213, "kl": 0.4540784254670143, "learning_rate": 1.046364054315007e-05, "loss": 0.1087, "reward": 1.1072917044162751, "reward_std": 0.18432877622544766, "rewards/accuracy_reward": 0.14791667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9593750238418579, "step": 1677 }, { "clip_ratio": 0.0, "completion_length": 599.6687652587891, "epoch": 0.5370459273483758, "grad_norm": 0.17657384276390076, "kl": 0.4048570953309536, "learning_rate": 1.0452476206780686e-05, "loss": 0.0706, "reward": 1.0468750238418578, "reward_std": 0.11663634181022645, "rewards/accuracy_reward": 0.0729166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583432674408, "step": 1678 }, { "clip_ratio": 0.0, "completion_length": 628.0166870117188, "epoch": 0.5373659785565691, "grad_norm": 0.23862002789974213, "kl": 0.5527682453393936, "learning_rate": 1.0441311305248258e-05, "loss": 0.0863, "reward": 1.051041704416275, "reward_std": 0.17258851006627082, "rewards/accuracy_reward": 0.08750000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9635416805744171, "step": 1679 }, { "clip_ratio": 0.0, "completion_length": 640.6208587646485, "epoch": 0.5376860297647623, "grad_norm": 0.4156144857406616, "kl": 0.6809851691126824, "learning_rate": 1.043014585249825e-05, "loss": 0.0852, "reward": 0.9619791805744171, "reward_std": 0.13484069108963012, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9557291805744171, "step": 1680 }, { "clip_ratio": 0.0, "completion_length": 624.408349609375, "epoch": 0.5380060809729557, "grad_norm": 0.28923049569129944, "kl": 0.6403138026595115, "learning_rate": 1.041897986247681e-05, "loss": 0.1207, "reward": 1.0343750178813935, "reward_std": 0.1490859840065241, "rewards/accuracy_reward": 0.07708333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916865348816, "step": 1681 }, { "clip_ratio": 0.0, "completion_length": 606.8729370117187, "epoch": 0.538326132181149, "grad_norm": 0.3962026834487915, "kl": 0.5739488750696182, "learning_rate": 1.0407813349130758e-05, "loss": 0.0971, "reward": 1.0812500357627868, "reward_std": 0.19865366518497468, "rewards/accuracy_reward": 0.12500000316649676, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9562500059604645, "step": 1682 }, { "clip_ratio": 0.0, "completion_length": 618.3541809082031, "epoch": 0.5386461833893423, "grad_norm": 0.3626198172569275, "kl": 0.4270638257265091, "learning_rate": 1.039664632640757e-05, "loss": 0.1173, "reward": 0.9994791924953461, "reward_std": 0.1307765144854784, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9598958551883697, "step": 1683 }, { "clip_ratio": 0.0, "completion_length": 587.3354370117188, "epoch": 0.5389662345975356, "grad_norm": 0.1408691704273224, "kl": 0.4760843113064766, "learning_rate": 1.0385478808255358e-05, "loss": 0.0734, "reward": 0.9770833492279053, "reward_std": 0.12489923723042011, "rewards/accuracy_reward": 0.012500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833492279052, "step": 1684 }, { "clip_ratio": 0.0, "completion_length": 605.3104370117187, "epoch": 0.5392862858057289, "grad_norm": 0.2985508143901825, "kl": 0.3501432552933693, "learning_rate": 1.0374310808622857e-05, "loss": 0.0841, "reward": 0.9781250119209289, "reward_std": 0.11064638346433639, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916746139527, "step": 1685 }, { "clip_ratio": 0.0, "completion_length": 613.0687683105468, "epoch": 0.5396063370139222, "grad_norm": 0.40105709433555603, "kl": 0.47514125406742097, "learning_rate": 1.0363142341459388e-05, "loss": 0.1189, "reward": 1.008854192495346, "reward_std": 0.1446002159267664, "rewards/accuracy_reward": 0.04791666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9609375178813935, "step": 1686 }, { "clip_ratio": 0.0, "completion_length": 568.7958557128907, "epoch": 0.5399263882221156, "grad_norm": 0.17932024598121643, "kl": 0.4909600533545017, "learning_rate": 1.0351973420714878e-05, "loss": 0.0821, "reward": 1.1140625178813934, "reward_std": 0.15720976814627646, "rewards/accuracy_reward": 0.137500006146729, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625119209289, "step": 1687 }, { "clip_ratio": 0.0, "completion_length": 570.2937683105469, "epoch": 0.5402464394303088, "grad_norm": 0.1724947839975357, "kl": 0.33601075038313866, "learning_rate": 1.0340804060339797e-05, "loss": 0.0944, "reward": 0.9734375178813934, "reward_std": 0.10235114470124244, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.935937511920929, "step": 1688 }, { "clip_ratio": 0.0, "completion_length": 532.4458557128906, "epoch": 0.5405664906385022, "grad_norm": 0.1991930603981018, "kl": 0.36307480111718177, "learning_rate": 1.0329634274285189e-05, "loss": 0.0666, "reward": 1.0364583492279054, "reward_std": 0.09679662808775902, "rewards/accuracy_reward": 0.06250000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583432674408, "step": 1689 }, { "clip_ratio": 0.0, "completion_length": 549.658349609375, "epoch": 0.5408865418466955, "grad_norm": 0.24091768264770508, "kl": 0.574387788772583, "learning_rate": 1.031846407650261e-05, "loss": 0.1114, "reward": 1.054166704416275, "reward_std": 0.15670420825481415, "rewards/accuracy_reward": 0.08958333600312471, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833551883698, "step": 1690 }, { "clip_ratio": 0.0, "completion_length": 557.4583557128906, "epoch": 0.5412065930548888, "grad_norm": 0.2540891766548157, "kl": 0.48114641904830935, "learning_rate": 1.030729348094414e-05, "loss": 0.1407, "reward": 1.071354192495346, "reward_std": 0.14129463881254195, "rewards/accuracy_reward": 0.09791667070239782, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973437511920929, "step": 1691 }, { "clip_ratio": 0.0, "completion_length": 563.5500244140625, "epoch": 0.5415266442630821, "grad_norm": 0.3592195510864258, "kl": 0.6452970117330551, "learning_rate": 1.0296122501562347e-05, "loss": 0.1715, "reward": 1.0239583671092987, "reward_std": 0.18306272029876708, "rewards/accuracy_reward": 0.06666666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916865348816, "step": 1692 }, { "clip_ratio": 0.0, "completion_length": 528.8125244140625, "epoch": 0.5418466954712754, "grad_norm": 0.192162424325943, "kl": 0.4380524292588234, "learning_rate": 1.0284951152310292e-05, "loss": 0.149, "reward": 1.006250011920929, "reward_std": 0.13173125982284545, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833432674408, "step": 1693 }, { "clip_ratio": 0.0, "completion_length": 542.4479309082031, "epoch": 0.5421667466794687, "grad_norm": 0.3079032301902771, "kl": 0.46010053232312204, "learning_rate": 1.0273779447141487e-05, "loss": 0.0816, "reward": 1.0546875238418578, "reward_std": 0.13184548616409303, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375, "step": 1694 }, { "clip_ratio": 0.0, "completion_length": 533.2458526611329, "epoch": 0.5424867978876621, "grad_norm": 0.2043299674987793, "kl": 0.27213485464453696, "learning_rate": 1.0262607400009895e-05, "loss": 0.1073, "reward": 1.1109375357627869, "reward_std": 0.1973067745566368, "rewards/accuracy_reward": 0.13541667088866233, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 1695 }, { "clip_ratio": 0.0, "completion_length": 552.2250152587891, "epoch": 0.5428068490958553, "grad_norm": 0.33777788281440735, "kl": 0.27160152047872543, "learning_rate": 1.0251435024869894e-05, "loss": 0.0876, "reward": 1.1229166865348816, "reward_std": 0.1457744762301445, "rewards/accuracy_reward": 0.14583333842456342, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833373069763, "step": 1696 }, { "clip_ratio": 0.0, "completion_length": 523.5833465576172, "epoch": 0.5431269003040486, "grad_norm": 0.19615255296230316, "kl": 0.38076775074005126, "learning_rate": 1.0240262335676294e-05, "loss": 0.1301, "reward": 1.1328125178813935, "reward_std": 0.11095966622233391, "rewards/accuracy_reward": 0.1541666716337204, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9765625119209289, "step": 1697 }, { "clip_ratio": 0.0, "completion_length": 557.7187652587891, "epoch": 0.543446951512242, "grad_norm": 0.23194903135299683, "kl": 0.2781291104853153, "learning_rate": 1.0229089346384273e-05, "loss": 0.0444, "reward": 1.053125023841858, "reward_std": 0.12236902713775635, "rewards/accuracy_reward": 0.07083333469927311, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916746139526, "step": 1698 }, { "clip_ratio": 0.0, "completion_length": 545.3187744140625, "epoch": 0.5437670027204353, "grad_norm": 0.3693836033344269, "kl": 0.6928737178444863, "learning_rate": 1.0217916070949405e-05, "loss": 0.1238, "reward": 1.1114583730697631, "reward_std": 0.17411674037575722, "rewards/accuracy_reward": 0.1437500011175871, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083551883697, "step": 1699 }, { "clip_ratio": 0.0, "completion_length": 566.377099609375, "epoch": 0.5440870539286286, "grad_norm": 0.2183925211429596, "kl": 0.44937950894236567, "learning_rate": 1.02067425233276e-05, "loss": 0.139, "reward": 1.1026041865348817, "reward_std": 0.16677277013659478, "rewards/accuracy_reward": 0.13958333786576987, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208492279053, "step": 1700 }, { "clip_ratio": 0.0, "completion_length": 550.3729309082031, "epoch": 0.5444071051368219, "grad_norm": 0.16149164736270905, "kl": 0.36703067272901535, "learning_rate": 1.0195568717475128e-05, "loss": 0.0896, "reward": 1.0265625178813935, "reward_std": 0.14087636768817902, "rewards/accuracy_reward": 0.054166668094694614, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9723958432674408, "step": 1701 }, { "clip_ratio": 0.0, "completion_length": 503.1750122070313, "epoch": 0.5447271563450152, "grad_norm": 0.37145406007766724, "kl": 0.3414155296981335, "learning_rate": 1.0184394667348572e-05, "loss": 0.1194, "reward": 1.0833333492279054, "reward_std": 0.13669775277376175, "rewards/accuracy_reward": 0.10833333730697632, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 1702 }, { "clip_ratio": 0.0, "completion_length": 528.1791809082031, "epoch": 0.5450472075532086, "grad_norm": 0.15215015411376953, "kl": 0.2324100576341152, "learning_rate": 1.0173220386904817e-05, "loss": 0.0564, "reward": 1.0458333671092988, "reward_std": 0.09034402184188366, "rewards/accuracy_reward": 0.056250002793967725, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9895833432674408, "step": 1703 }, { "clip_ratio": 0.0, "completion_length": 504.6291778564453, "epoch": 0.5453672587614018, "grad_norm": 0.17038388550281525, "kl": 0.5619470663368702, "learning_rate": 1.016204589010104e-05, "loss": 0.1342, "reward": 1.1770833611488343, "reward_std": 0.1644980400800705, "rewards/accuracy_reward": 0.20625000745058059, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 1704 }, { "clip_ratio": 0.0, "completion_length": 550.4396057128906, "epoch": 0.5456873099695951, "grad_norm": 0.11146536469459534, "kl": 0.30440557897090914, "learning_rate": 1.0150871190894693e-05, "loss": 0.1285, "reward": 1.051041704416275, "reward_std": 0.11958832629024982, "rewards/accuracy_reward": 0.08125000298023224, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.969791692495346, "step": 1705 }, { "clip_ratio": 0.0, "completion_length": 530.3520965576172, "epoch": 0.5460073611777885, "grad_norm": 0.14789950847625732, "kl": 0.2733158372342587, "learning_rate": 1.0139696303243471e-05, "loss": 0.08, "reward": 1.0864583432674408, "reward_std": 0.04624491911381483, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583432674408, "step": 1706 }, { "clip_ratio": 0.0, "completion_length": 548.9750183105468, "epoch": 0.5463274123859817, "grad_norm": 0.09990892559289932, "kl": 0.3074110925197601, "learning_rate": 1.0128521241105312e-05, "loss": 0.0853, "reward": 0.9968750178813934, "reward_std": 0.0871005192399025, "rewards/accuracy_reward": 0.01458333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916746139526, "step": 1707 }, { "clip_ratio": 0.0, "completion_length": 506.25001525878906, "epoch": 0.546647463594175, "grad_norm": 0.13222353160381317, "kl": 0.2891290545463562, "learning_rate": 1.0117346018438367e-05, "loss": 0.0606, "reward": 1.1437500238418579, "reward_std": 0.14657528325915337, "rewards/accuracy_reward": 0.15833333786576986, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166686534882, "step": 1708 }, { "clip_ratio": 0.0, "completion_length": 512.4229370117188, "epoch": 0.5469675148023684, "grad_norm": 0.08542126417160034, "kl": 0.1715974800288677, "learning_rate": 1.0106170649200985e-05, "loss": 0.0243, "reward": 1.0723958730697631, "reward_std": 0.06993448249995708, "rewards/accuracy_reward": 0.08125000242143869, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9911458492279053, "step": 1709 }, { "clip_ratio": 0.0, "completion_length": 541.3270935058594, "epoch": 0.5472875660105617, "grad_norm": 0.12961845099925995, "kl": 0.29536170735955236, "learning_rate": 1.0094995147351715e-05, "loss": 0.0834, "reward": 1.0854166984558105, "reward_std": 0.09780984222888947, "rewards/accuracy_reward": 0.10000000335276127, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166746139527, "step": 1710 }, { "clip_ratio": 0.0, "completion_length": 531.302099609375, "epoch": 0.547607617218755, "grad_norm": 0.10168123990297318, "kl": 0.19317631945014, "learning_rate": 1.008381952684925e-05, "loss": 0.0354, "reward": 1.0630208671092987, "reward_std": 0.08708528894931078, "rewards/accuracy_reward": 0.07291666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9901041746139526, "step": 1711 }, { "clip_ratio": 0.0, "completion_length": 534.3437652587891, "epoch": 0.5479276684269483, "grad_norm": 0.11124243587255478, "kl": 0.2048973672091961, "learning_rate": 1.0072643801652442e-05, "loss": 0.0773, "reward": 1.0687500178813933, "reward_std": 0.1460909903049469, "rewards/accuracy_reward": 0.09166667070239783, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833432674408, "step": 1712 }, { "clip_ratio": 0.0, "completion_length": 531.2229370117187, "epoch": 0.5482477196351416, "grad_norm": 0.0702410340309143, "kl": 0.2370643712580204, "learning_rate": 1.006146798572027e-05, "loss": 0.0927, "reward": 1.0328125357627869, "reward_std": 0.14687610492110253, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9828125059604644, "step": 1713 }, { "clip_ratio": 0.0, "completion_length": 540.8208465576172, "epoch": 0.548567770843335, "grad_norm": 0.06981991976499557, "kl": 0.22894330993294715, "learning_rate": 1.0050292093011835e-05, "loss": 0.0763, "reward": 1.0192708492279052, "reward_std": 0.052289125695824626, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 1714 }, { "clip_ratio": 0.0, "completion_length": 549.9583465576172, "epoch": 0.5488878220515282, "grad_norm": 0.10607703030109406, "kl": 0.28106397688388823, "learning_rate": 1.0039116137486323e-05, "loss": 0.0508, "reward": 1.1041666924953462, "reward_std": 0.1148946724832058, "rewards/accuracy_reward": 0.11666667070239782, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9833333432674408, "step": 1715 }, { "clip_ratio": 0.0, "completion_length": 552.5750183105469, "epoch": 0.5492078732597215, "grad_norm": 0.1637829840183258, "kl": 0.25569094344973564, "learning_rate": 1.0027940133103005e-05, "loss": 0.0764, "reward": 1.0239583611488343, "reward_std": 0.12470999825745821, "rewards/accuracy_reward": 0.041666668653488156, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916865348816, "step": 1716 }, { "clip_ratio": 0.0, "completion_length": 543.1979339599609, "epoch": 0.5495279244679149, "grad_norm": 0.0756787434220314, "kl": 0.1757739432156086, "learning_rate": 1.0016764093821203e-05, "loss": 0.0281, "reward": 1.0666666984558106, "reward_std": 0.08019343838095665, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9895833373069763, "step": 1717 }, { "clip_ratio": 0.0, "completion_length": 546.1271057128906, "epoch": 0.5498479756761082, "grad_norm": 0.045003801584243774, "kl": 0.13475093320012094, "learning_rate": 1.0005588033600305e-05, "loss": 0.0476, "reward": 1.083854192495346, "reward_std": 0.04586947858333588, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541805744171, "step": 1718 }, { "clip_ratio": 0.0, "completion_length": 554.1666839599609, "epoch": 0.5501680268843014, "grad_norm": 0.1230769082903862, "kl": 0.2083885557949543, "learning_rate": 9.994411966399699e-06, "loss": 0.0736, "reward": 1.0526041865348816, "reward_std": 0.10287219993770122, "rewards/accuracy_reward": 0.06875000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541865348815, "step": 1719 }, { "clip_ratio": 0.0, "completion_length": 550.9979370117187, "epoch": 0.5504880780924948, "grad_norm": 0.1451004594564438, "kl": 0.36494098976254463, "learning_rate": 9.983235906178798e-06, "loss": 0.1206, "reward": 1.0989583611488343, "reward_std": 0.1661299206316471, "rewards/accuracy_reward": 0.1291666727513075, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9697916865348816, "step": 1720 }, { "clip_ratio": 0.0, "completion_length": 545.5500244140625, "epoch": 0.5508081293006881, "grad_norm": 0.08950361609458923, "kl": 0.330229202657938, "learning_rate": 9.972059866897002e-06, "loss": 0.0704, "reward": 1.0515625178813934, "reward_std": 0.1308392234146595, "rewards/accuracy_reward": 0.07708333488553762, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791805744171, "step": 1721 }, { "clip_ratio": 0.0, "completion_length": 557.695849609375, "epoch": 0.5511281805088815, "grad_norm": 0.054417140781879425, "kl": 0.10512780025601387, "learning_rate": 9.960883862513682e-06, "loss": 0.0288, "reward": 1.082291692495346, "reward_std": 0.10033504888415337, "rewards/accuracy_reward": 0.08958333563059569, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.990625011920929, "step": 1722 }, { "clip_ratio": 0.0, "completion_length": 592.5875183105469, "epoch": 0.5514482317170747, "grad_norm": 0.09014873951673508, "kl": 0.20714004188776017, "learning_rate": 9.949707906988165e-06, "loss": 0.0622, "reward": 1.0781250238418578, "reward_std": 0.11389563996344805, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 1723 }, { "clip_ratio": 0.0, "completion_length": 583.9979370117187, "epoch": 0.551768282925268, "grad_norm": 0.08591794967651367, "kl": 0.17230487614870071, "learning_rate": 9.938532014279731e-06, "loss": 0.0433, "reward": 1.1239583611488342, "reward_std": 0.08865927271544934, "rewards/accuracy_reward": 0.1395833369344473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9843750119209289, "step": 1724 }, { "clip_ratio": 0.0, "completion_length": 556.6645904541016, "epoch": 0.5520883341334614, "grad_norm": 0.047448597848415375, "kl": 0.14786509796977043, "learning_rate": 9.927356198347561e-06, "loss": 0.0394, "reward": 1.0640625476837158, "reward_std": 0.09608743041753769, "rewards/accuracy_reward": 0.07500000186264515, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9890625059604645, "step": 1725 }, { "clip_ratio": 0.0, "completion_length": 575.5562713623046, "epoch": 0.5524083853416547, "grad_norm": 0.13344469666481018, "kl": 0.33826933801174164, "learning_rate": 9.916180473150753e-06, "loss": 0.0634, "reward": 0.9958333551883698, "reward_std": 0.11803839653730393, "rewards/accuracy_reward": 0.01875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833432674408, "step": 1726 }, { "clip_ratio": 0.0, "completion_length": 599.1375122070312, "epoch": 0.5527284365498479, "grad_norm": 0.044267505407333374, "kl": 0.1466631069779396, "learning_rate": 9.905004852648288e-06, "loss": 0.0545, "reward": 1.0765625357627868, "reward_std": 0.1297352697700262, "rewards/accuracy_reward": 0.09166666883975268, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958551883698, "step": 1727 }, { "clip_ratio": 0.0, "completion_length": 576.8937683105469, "epoch": 0.5530484877580413, "grad_norm": 0.11980026960372925, "kl": 0.28844860270619394, "learning_rate": 9.893829350799016e-06, "loss": 0.0464, "reward": 1.1302083611488343, "reward_std": 0.09609245825558901, "rewards/accuracy_reward": 0.1416666716337204, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9864583492279053, "step": 1728 }, { "clip_ratio": 0.0, "completion_length": 582.214599609375, "epoch": 0.5533685389662346, "grad_norm": 0.072134830057621, "kl": 0.25654660165309906, "learning_rate": 9.882653981561638e-06, "loss": 0.0195, "reward": 1.0312500238418578, "reward_std": 0.08351408448070288, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9833333492279053, "step": 1729 }, { "clip_ratio": 0.0, "completion_length": 570.9604278564453, "epoch": 0.553688590174428, "grad_norm": 0.08518262952566147, "kl": 0.26288840398192403, "learning_rate": 9.871478758894692e-06, "loss": 0.0425, "reward": 1.1421875298023223, "reward_std": 0.14297616370022298, "rewards/accuracy_reward": 0.16458334047347306, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 1730 }, { "clip_ratio": 0.0, "completion_length": 607.177099609375, "epoch": 0.5540086413826212, "grad_norm": 0.05721981078386307, "kl": 0.2188819907605648, "learning_rate": 9.860303696756528e-06, "loss": 0.045, "reward": 1.1135417103767395, "reward_std": 0.10241693221032619, "rewards/accuracy_reward": 0.13333333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9802083492279052, "step": 1731 }, { "clip_ratio": 0.0, "completion_length": 580.2521026611328, "epoch": 0.5543286925908145, "grad_norm": 0.2216687947511673, "kl": 0.12100109234452247, "learning_rate": 9.849128809105309e-06, "loss": 0.0488, "reward": 1.044791692495346, "reward_std": 0.11933745443820953, "rewards/accuracy_reward": 0.058333336189389226, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583432674408, "step": 1732 }, { "clip_ratio": 0.0, "completion_length": 582.4812805175782, "epoch": 0.5546487437990079, "grad_norm": 0.07465074211359024, "kl": 0.25847496688365934, "learning_rate": 9.837954109898961e-06, "loss": 0.0534, "reward": 1.0395833730697632, "reward_std": 0.16923051699995995, "rewards/accuracy_reward": 0.06666666772216559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166865348816, "step": 1733 }, { "clip_ratio": 0.0, "completion_length": 587.4104370117187, "epoch": 0.5549687950072012, "grad_norm": 0.06225651502609253, "kl": 0.18474493846297263, "learning_rate": 9.826779613095188e-06, "loss": 0.044, "reward": 1.0541666805744172, "reward_std": 0.09043601714074612, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9791666805744171, "step": 1734 }, { "clip_ratio": 0.0, "completion_length": 589.4187683105469, "epoch": 0.5552888462153944, "grad_norm": 0.08788628876209259, "kl": 0.15150585621595383, "learning_rate": 9.815605332651433e-06, "loss": 0.0426, "reward": 1.0718750178813934, "reward_std": 0.11696450784802437, "rewards/accuracy_reward": 0.0854166692122817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583373069763, "step": 1735 }, { "clip_ratio": 0.0, "completion_length": 599.5146118164063, "epoch": 0.5556088974235878, "grad_norm": 0.07596233487129211, "kl": 0.38066075593233106, "learning_rate": 9.804431282524874e-06, "loss": 0.089, "reward": 1.019791692495346, "reward_std": 0.12768033295869827, "rewards/accuracy_reward": 0.04583333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583432674408, "step": 1736 }, { "clip_ratio": 0.0, "completion_length": 590.495849609375, "epoch": 0.5559289486317811, "grad_norm": 0.19324903190135956, "kl": 0.40700176954269407, "learning_rate": 9.793257476672403e-06, "loss": 0.0711, "reward": 1.0598958611488343, "reward_std": 0.1368673298507929, "rewards/accuracy_reward": 0.08958333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.970312523841858, "step": 1737 }, { "clip_ratio": 0.0, "completion_length": 598.3854339599609, "epoch": 0.5562489998399744, "grad_norm": 0.08396127820014954, "kl": 0.2656080096960068, "learning_rate": 9.782083929050601e-06, "loss": 0.0594, "reward": 1.082812535762787, "reward_std": 0.1377605564892292, "rewards/accuracy_reward": 0.10416666921228171, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1738 }, { "clip_ratio": 0.0, "completion_length": 619.589599609375, "epoch": 0.5565690510481677, "grad_norm": 0.10959643125534058, "kl": 0.32876670695841315, "learning_rate": 9.77091065361573e-06, "loss": 0.0646, "reward": 1.0072916924953461, "reward_std": 0.1158070158213377, "rewards/accuracy_reward": 0.02916666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9781250178813934, "step": 1739 }, { "clip_ratio": 0.0, "completion_length": 654.5187744140625, "epoch": 0.556889102256361, "grad_norm": 0.1902201622724533, "kl": 0.44203677251935003, "learning_rate": 9.759737664323709e-06, "loss": 0.1057, "reward": 1.0968750298023224, "reward_std": 0.14357101432979108, "rewards/accuracy_reward": 0.13333333637565375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9635416865348816, "step": 1740 }, { "clip_ratio": 0.0, "completion_length": 590.4333435058594, "epoch": 0.5572091534645544, "grad_norm": 0.2078673541545868, "kl": 0.5072756253182888, "learning_rate": 9.748564975130106e-06, "loss": 0.1311, "reward": 1.0427083432674409, "reward_std": 0.1215952442958951, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083432674408, "step": 1741 }, { "clip_ratio": 0.0, "completion_length": 597.9771118164062, "epoch": 0.5575292046727477, "grad_norm": 0.11386916786432266, "kl": 0.34886466041207315, "learning_rate": 9.737392599990109e-06, "loss": 0.0777, "reward": 1.122916692495346, "reward_std": 0.14374384582042693, "rewards/accuracy_reward": 0.14791667126119137, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 1742 }, { "clip_ratio": 0.0, "completion_length": 610.6396057128907, "epoch": 0.5578492558809409, "grad_norm": 0.1811244934797287, "kl": 0.3296015664935112, "learning_rate": 9.726220552858516e-06, "loss": 0.113, "reward": 1.0166666865348817, "reward_std": 0.1435274824500084, "rewards/accuracy_reward": 0.052083334513008596, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833432674408, "step": 1743 }, { "clip_ratio": 0.0, "completion_length": 612.3541809082031, "epoch": 0.5581693070891343, "grad_norm": 0.2830093204975128, "kl": 0.4410018026828766, "learning_rate": 9.71504884768971e-06, "loss": 0.1006, "reward": 1.091666692495346, "reward_std": 0.17801312804222108, "rewards/accuracy_reward": 0.11875000111758709, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9708333492279053, "step": 1744 }, { "clip_ratio": 0.0, "completion_length": 631.2833557128906, "epoch": 0.5584893582973276, "grad_norm": 0.30681195855140686, "kl": 0.5721335649490357, "learning_rate": 9.703877498437657e-06, "loss": 0.0964, "reward": 1.029166680574417, "reward_std": 0.11975382026284934, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9604166805744171, "step": 1745 }, { "clip_ratio": 0.0, "completion_length": 624.4666931152344, "epoch": 0.5588094095055209, "grad_norm": 0.22259894013404846, "kl": 0.3894990190863609, "learning_rate": 9.692706519055865e-06, "loss": 0.0725, "reward": 1.0526041865348816, "reward_std": 0.14981426876038312, "rewards/accuracy_reward": 0.07916666939854622, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973437511920929, "step": 1746 }, { "clip_ratio": 0.0, "completion_length": 607.0333557128906, "epoch": 0.5591294607137142, "grad_norm": 0.1207633763551712, "kl": 0.33568143993616106, "learning_rate": 9.681535923497394e-06, "loss": 0.0745, "reward": 1.0114583492279052, "reward_std": 0.14165182113647462, "rewards/accuracy_reward": 0.03958333358168602, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9677083432674408, "step": 1747 }, { "clip_ratio": 0.0, "completion_length": 594.6666839599609, "epoch": 0.5594495119219075, "grad_norm": 0.22380438446998596, "kl": 0.346661651134491, "learning_rate": 9.670365725714811e-06, "loss": 0.0865, "reward": 1.076562523841858, "reward_std": 0.14469496812671423, "rewards/accuracy_reward": 0.1125000011175871, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9640625238418579, "step": 1748 }, { "clip_ratio": 0.0, "completion_length": 595.5687744140625, "epoch": 0.5597695631301008, "grad_norm": 0.12634803354740143, "kl": 0.41345595121383666, "learning_rate": 9.659195939660203e-06, "loss": 0.0835, "reward": 1.1130208730697633, "reward_std": 0.17043216675519943, "rewards/accuracy_reward": 0.1458333373069763, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9651041805744172, "step": 1749 }, { "clip_ratio": 0.0, "completion_length": 604.4729309082031, "epoch": 0.5600896143382941, "grad_norm": 0.10981366038322449, "kl": 0.3390358090400696, "learning_rate": 9.648026579285125e-06, "loss": 0.069, "reward": 1.0333333551883697, "reward_std": 0.16134923771023751, "rewards/accuracy_reward": 0.0562500024214387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833492279053, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 636.1771057128906, "epoch": 0.5604096655464874, "grad_norm": 0.10159897804260254, "kl": 0.16987637989223003, "learning_rate": 9.636857658540615e-06, "loss": 0.0226, "reward": 1.0968750238418579, "reward_std": 0.1050514079630375, "rewards/accuracy_reward": 0.11041666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583492279053, "step": 1751 }, { "clip_ratio": 0.0, "completion_length": 629.0166870117188, "epoch": 0.5607297167546808, "grad_norm": 0.09187393635511398, "kl": 0.2695496570318937, "learning_rate": 9.625689191377148e-06, "loss": 0.0551, "reward": 1.1104167103767395, "reward_std": 0.15159886330366135, "rewards/accuracy_reward": 0.13750000465661288, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166865348816, "step": 1752 }, { "clip_ratio": 0.0, "completion_length": 643.1916931152343, "epoch": 0.5610497679628741, "grad_norm": 0.4285680651664734, "kl": 0.2703603833913803, "learning_rate": 9.614521191744644e-06, "loss": 0.0776, "reward": 1.0583333611488341, "reward_std": 0.12563749849796296, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000238418579, "step": 1753 }, { "clip_ratio": 0.0, "completion_length": 629.8146118164062, "epoch": 0.5613698191710673, "grad_norm": 0.09549083560705185, "kl": 0.2873098261654377, "learning_rate": 9.603353673592435e-06, "loss": 0.0493, "reward": 1.0541666984558105, "reward_std": 0.14787348750978707, "rewards/accuracy_reward": 0.07500000130385161, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9770833611488342, "step": 1754 }, { "clip_ratio": 0.0, "completion_length": 608.4833557128907, "epoch": 0.5616898703792607, "grad_norm": 0.19138483703136444, "kl": 0.2949482426047325, "learning_rate": 9.592186650869245e-06, "loss": 0.0604, "reward": 1.1250000417232513, "reward_std": 0.1398756790906191, "rewards/accuracy_reward": 0.14583333879709243, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666805744171, "step": 1755 }, { "clip_ratio": 0.0, "completion_length": 606.3729370117187, "epoch": 0.562009921587454, "grad_norm": 0.16488175094127655, "kl": 0.34419357851147653, "learning_rate": 9.581020137523192e-06, "loss": 0.0469, "reward": 1.068750023841858, "reward_std": 0.0808381624519825, "rewards/accuracy_reward": 0.08125000298023224, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9854166805744171, "step": 1756 }, { "clip_ratio": 0.0, "completion_length": 645.8333435058594, "epoch": 0.5623299727956473, "grad_norm": 0.08196991682052612, "kl": 0.24638563096523286, "learning_rate": 9.569854147501752e-06, "loss": 0.0558, "reward": 1.0651041865348816, "reward_std": 0.17485166918486356, "rewards/accuracy_reward": 0.08541666939854622, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9776041805744171, "step": 1757 }, { "clip_ratio": 0.0, "completion_length": 599.8979309082031, "epoch": 0.5626500240038406, "grad_norm": 0.17706768214702606, "kl": 0.2538708359003067, "learning_rate": 9.55868869475174e-06, "loss": 0.0841, "reward": 1.074479204416275, "reward_std": 0.15826401822268962, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.970312523841858, "step": 1758 }, { "clip_ratio": 0.0, "completion_length": 625.7750244140625, "epoch": 0.5629700752120339, "grad_norm": 0.08481159806251526, "kl": 0.2495666116476059, "learning_rate": 9.547523793219315e-06, "loss": 0.0402, "reward": 1.0734375417232513, "reward_std": 0.1499858619645238, "rewards/accuracy_reward": 0.10000000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9734375238418579, "step": 1759 }, { "clip_ratio": 0.0, "completion_length": 592.827099609375, "epoch": 0.5632901264202272, "grad_norm": 0.11646457761526108, "kl": 0.3505744531750679, "learning_rate": 9.536359456849933e-06, "loss": 0.0508, "reward": 1.119791716337204, "reward_std": 0.1696897467598319, "rewards/accuracy_reward": 0.1458333356305957, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583551883697, "step": 1760 }, { "clip_ratio": 0.0, "completion_length": 608.7646057128907, "epoch": 0.5636101776284206, "grad_norm": 0.16916526854038239, "kl": 0.3435643449425697, "learning_rate": 9.52519569958835e-06, "loss": 0.0638, "reward": 1.092187523841858, "reward_std": 0.16328086461871863, "rewards/accuracy_reward": 0.1187500024214387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.973437511920929, "step": 1761 }, { "clip_ratio": 0.0, "completion_length": 635.420849609375, "epoch": 0.5639302288366138, "grad_norm": 0.2558411657810211, "kl": 0.39502771496772765, "learning_rate": 9.514032535378604e-06, "loss": 0.0492, "reward": 1.025000023841858, "reward_std": 0.11617990657687187, "rewards/accuracy_reward": 0.043750000186264515, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9791666746139527, "step": 1762 }, { "clip_ratio": 0.0, "completion_length": 618.1646057128906, "epoch": 0.5642502800448072, "grad_norm": 0.23282131552696228, "kl": 0.2503409251570702, "learning_rate": 9.50286997816398e-06, "loss": 0.0868, "reward": 1.0854166865348815, "reward_std": 0.1403332645073533, "rewards/accuracy_reward": 0.11041666939854622, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9729166865348816, "step": 1763 }, { "clip_ratio": 0.0, "completion_length": 603.9833465576172, "epoch": 0.5645703312530005, "grad_norm": 0.5210441946983337, "kl": 0.37920553535223006, "learning_rate": 9.491708041887017e-06, "loss": 0.1104, "reward": 1.127604204416275, "reward_std": 0.15253622308373452, "rewards/accuracy_reward": 0.15625000447034837, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9692708551883698, "step": 1764 }, { "clip_ratio": 0.0, "completion_length": 619.0166809082032, "epoch": 0.5648903824611938, "grad_norm": 0.18722592294216156, "kl": 0.6182475075125694, "learning_rate": 9.480546740489468e-06, "loss": 0.0809, "reward": 1.0296875417232514, "reward_std": 0.15643419921398163, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9588541924953461, "step": 1765 }, { "clip_ratio": 0.0, "completion_length": 611.4625183105469, "epoch": 0.5652104336693871, "grad_norm": 0.42721042037010193, "kl": 0.5760065197944642, "learning_rate": 9.469386087912302e-06, "loss": 0.1066, "reward": 1.1250000238418578, "reward_std": 0.1428021177649498, "rewards/accuracy_reward": 0.1562500052154064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500119209289, "step": 1766 }, { "clip_ratio": 0.0, "completion_length": 591.5666809082031, "epoch": 0.5655304848775804, "grad_norm": 0.2159615010023117, "kl": 0.4528600886464119, "learning_rate": 9.458226098095675e-06, "loss": 0.1271, "reward": 1.0973958671092987, "reward_std": 0.15849269963800908, "rewards/accuracy_reward": 0.1395833384245634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9578125178813934, "step": 1767 }, { "clip_ratio": 0.0, "completion_length": 621.2062683105469, "epoch": 0.5658505360857737, "grad_norm": 0.2607513964176178, "kl": 0.5638085767626763, "learning_rate": 9.447066784978914e-06, "loss": 0.0944, "reward": 1.0526041984558105, "reward_std": 0.15217989590018988, "rewards/accuracy_reward": 0.09375000204890967, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9567708492279052, "step": 1768 }, { "clip_ratio": 0.0, "completion_length": 620.6875244140625, "epoch": 0.5661705872939671, "grad_norm": 0.28668591380119324, "kl": 0.6076577290892601, "learning_rate": 9.435908162500499e-06, "loss": 0.0706, "reward": 1.0057291746139527, "reward_std": 0.14748432487249374, "rewards/accuracy_reward": 0.039583333395421506, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9661458432674408, "step": 1769 }, { "clip_ratio": 0.0, "completion_length": 612.2250152587891, "epoch": 0.5664906385021603, "grad_norm": 0.2156950682401657, "kl": 0.48343914821743966, "learning_rate": 9.42475024459805e-06, "loss": 0.1229, "reward": 1.1572916984558106, "reward_std": 0.2026117168366909, "rewards/accuracy_reward": 0.20000000428408385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916805744172, "step": 1770 }, { "clip_ratio": 0.0, "completion_length": 636.0208465576172, "epoch": 0.5668106897103536, "grad_norm": 0.25211068987846375, "kl": 0.4843083009123802, "learning_rate": 9.413593045208303e-06, "loss": 0.0723, "reward": 0.9859375298023224, "reward_std": 0.14214141964912413, "rewards/accuracy_reward": 0.01875000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.967187511920929, "step": 1771 }, { "clip_ratio": 0.0, "completion_length": 582.2229248046875, "epoch": 0.567130740918547, "grad_norm": 0.17530737817287445, "kl": 0.41530356407165525, "learning_rate": 9.402436578267106e-06, "loss": 0.0879, "reward": 1.0848958492279053, "reward_std": 0.15454170852899551, "rewards/accuracy_reward": 0.12291667088866234, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9598958492279053, "step": 1772 }, { "clip_ratio": 0.0, "completion_length": 608.9562622070313, "epoch": 0.5674507921267403, "grad_norm": 0.1589900255203247, "kl": 0.5026217520236969, "learning_rate": 9.391280857709374e-06, "loss": 0.0764, "reward": 0.9989583551883697, "reward_std": 0.16388722117990256, "rewards/accuracy_reward": 0.03125000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083432674408, "step": 1773 }, { "clip_ratio": 0.0, "completion_length": 582.2562744140625, "epoch": 0.5677708433349336, "grad_norm": 0.3550090491771698, "kl": 0.3733747750520706, "learning_rate": 9.380125897469116e-06, "loss": 0.0788, "reward": 1.0442708551883697, "reward_std": 0.17154857516288757, "rewards/accuracy_reward": 0.07708333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9671875059604644, "step": 1774 }, { "clip_ratio": 0.0, "completion_length": 572.4750152587891, "epoch": 0.5680908945431269, "grad_norm": 0.32089048624038696, "kl": 0.5282266348600387, "learning_rate": 9.36897171147937e-06, "loss": 0.1056, "reward": 1.0885417103767394, "reward_std": 0.21879145503044128, "rewards/accuracy_reward": 0.11666667088866234, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9697916805744171, "step": 1775 }, { "clip_ratio": 0.0, "completion_length": 597.164599609375, "epoch": 0.5684109457513202, "grad_norm": 0.3526063859462738, "kl": 0.6023999392986298, "learning_rate": 9.357818313672216e-06, "loss": 0.0963, "reward": 1.0281250178813934, "reward_std": 0.16391725689172745, "rewards/accuracy_reward": 0.06458333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9635416686534881, "step": 1776 }, { "clip_ratio": 0.0, "completion_length": 616.9229400634765, "epoch": 0.5687309969595136, "grad_norm": 0.3334544897079468, "kl": 0.6702438533306122, "learning_rate": 9.346665717978742e-06, "loss": 0.1199, "reward": 1.0270833611488341, "reward_std": 0.14344887398183345, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.956250011920929, "step": 1777 }, { "clip_ratio": 0.0, "completion_length": 607.3396057128906, "epoch": 0.5690510481677068, "grad_norm": 0.3106757700443268, "kl": 0.7477944895625115, "learning_rate": 9.335513938329046e-06, "loss": 0.1181, "reward": 1.0416666865348816, "reward_std": 0.21114777252078057, "rewards/accuracy_reward": 0.08750000167638064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9541666805744171, "step": 1778 }, { "clip_ratio": 0.0, "completion_length": 586.0812683105469, "epoch": 0.5693710993759001, "grad_norm": 0.1424209475517273, "kl": 0.5696847230195999, "learning_rate": 9.324362988652195e-06, "loss": 0.1224, "reward": 1.0354166865348815, "reward_std": 0.169924059510231, "rewards/accuracy_reward": 0.07083333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9645833432674408, "step": 1779 }, { "clip_ratio": 0.0, "completion_length": 578.4000213623046, "epoch": 0.5696911505840935, "grad_norm": 0.16978739202022552, "kl": 0.43719258829951285, "learning_rate": 9.313212882876228e-06, "loss": 0.1375, "reward": 1.085937535762787, "reward_std": 0.15211977660655976, "rewards/accuracy_reward": 0.11875000353902579, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.967187511920929, "step": 1780 }, { "clip_ratio": 0.0, "completion_length": 602.8708465576171, "epoch": 0.5700112017922868, "grad_norm": 0.2683226466178894, "kl": 0.49288763031363486, "learning_rate": 9.30206363492812e-06, "loss": 0.1011, "reward": 1.0385416924953461, "reward_std": 0.1908488892018795, "rewards/accuracy_reward": 0.07916666697710753, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.959375011920929, "step": 1781 }, { "clip_ratio": 0.0, "completion_length": 585.5250183105469, "epoch": 0.57033125300048, "grad_norm": 0.29383039474487305, "kl": 0.6997188687324524, "learning_rate": 9.290915258733792e-06, "loss": 0.1354, "reward": 0.9651041865348816, "reward_std": 0.16285497993230819, "rewards/accuracy_reward": 0.01250000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9526041686534882, "step": 1782 }, { "clip_ratio": 0.0, "completion_length": 590.9250122070313, "epoch": 0.5706513042086734, "grad_norm": 0.1701168715953827, "kl": 0.5816778719425202, "learning_rate": 9.279767768218058e-06, "loss": 0.1418, "reward": 1.1244792103767396, "reward_std": 0.1965101033449173, "rewards/accuracy_reward": 0.16250000428408384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9619791746139527, "step": 1783 }, { "clip_ratio": 0.0, "completion_length": 596.2729431152344, "epoch": 0.5709713554168667, "grad_norm": 0.1655789464712143, "kl": 0.44416755214333536, "learning_rate": 9.268621177304635e-06, "loss": 0.0587, "reward": 1.0848958611488342, "reward_std": 0.10237012188881636, "rewards/accuracy_reward": 0.10208333600312472, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.982812511920929, "step": 1784 }, { "clip_ratio": 0.0, "completion_length": 604.1750183105469, "epoch": 0.5712914066250601, "grad_norm": 0.11165868490934372, "kl": 0.3100586123764515, "learning_rate": 9.25747549991611e-06, "loss": 0.0946, "reward": 1.0302083551883698, "reward_std": 0.16860452741384507, "rewards/accuracy_reward": 0.06250000018626452, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9677083492279053, "step": 1785 }, { "clip_ratio": 0.0, "completion_length": 556.6375244140625, "epoch": 0.5716114578332533, "grad_norm": 0.19193939864635468, "kl": 0.3580825373530388, "learning_rate": 9.246330749973943e-06, "loss": 0.0696, "reward": 1.1083333492279053, "reward_std": 0.14191762804985047, "rewards/accuracy_reward": 0.12500000540167094, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333373069764, "step": 1786 }, { "clip_ratio": 0.0, "completion_length": 572.5541809082031, "epoch": 0.5719315090414466, "grad_norm": 0.06165502592921257, "kl": 0.18685958310961723, "learning_rate": 9.235186941398412e-06, "loss": 0.0567, "reward": 1.0651041865348816, "reward_std": 0.08984843343496322, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 1787 }, { "clip_ratio": 0.0, "completion_length": 543.8687713623046, "epoch": 0.57225156024964, "grad_norm": 0.21538884937763214, "kl": 0.3270559675991535, "learning_rate": 9.224044088108642e-06, "loss": 0.0818, "reward": 1.1588541984558105, "reward_std": 0.10733593087643385, "rewards/accuracy_reward": 0.18333333935588597, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 1788 }, { "clip_ratio": 0.0, "completion_length": 564.9062713623047, "epoch": 0.5725716114578333, "grad_norm": 0.1343001127243042, "kl": 0.19680135846138, "learning_rate": 9.212902204022556e-06, "loss": 0.057, "reward": 1.0593750178813934, "reward_std": 0.09946857746690511, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9885416686534881, "step": 1789 }, { "clip_ratio": 0.0, "completion_length": 550.0208587646484, "epoch": 0.5728916626660265, "grad_norm": 0.05754992738366127, "kl": 0.15173882991075516, "learning_rate": 9.20176130305686e-06, "loss": 0.0388, "reward": 1.1447916865348815, "reward_std": 0.07982183620333672, "rewards/accuracy_reward": 0.1541666707023978, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990625, "step": 1790 }, { "clip_ratio": 0.0, "completion_length": 565.8437713623047, "epoch": 0.5732117138742199, "grad_norm": 0.2448616623878479, "kl": 0.30400142446160316, "learning_rate": 9.190621399127045e-06, "loss": 0.0644, "reward": 1.0401041865348817, "reward_std": 0.10776591561734676, "rewards/accuracy_reward": 0.058333334513008595, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708432674408, "step": 1791 }, { "clip_ratio": 0.0, "completion_length": 569.3979431152344, "epoch": 0.5735317650824132, "grad_norm": 0.05121641978621483, "kl": 0.16107941642403603, "learning_rate": 9.179482506147346e-06, "loss": 0.0506, "reward": 1.0645833432674408, "reward_std": 0.06739432364702225, "rewards/accuracy_reward": 0.07916666995733976, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9854166746139527, "step": 1792 }, { "clip_ratio": 0.0, "completion_length": 567.7750274658204, "epoch": 0.5738518162906064, "grad_norm": 0.17669126391410828, "kl": 0.1536485359072685, "learning_rate": 9.168344638030743e-06, "loss": 0.0414, "reward": 1.0526041984558105, "reward_std": 0.07714264132082463, "rewards/accuracy_reward": 0.06250000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9901041746139526, "step": 1793 }, { "clip_ratio": 0.0, "completion_length": 556.4916900634765, "epoch": 0.5741718674987998, "grad_norm": 0.06108405813574791, "kl": 0.1886889159679413, "learning_rate": 9.157207808688925e-06, "loss": 0.0528, "reward": 1.0994791984558105, "reward_std": 0.10481414943933487, "rewards/accuracy_reward": 0.11458333749324083, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9848958492279053, "step": 1794 }, { "clip_ratio": 0.0, "completion_length": 564.9520904541016, "epoch": 0.5744919187069931, "grad_norm": 0.0590064600110054, "kl": 0.25982956662774087, "learning_rate": 9.146072032032298e-06, "loss": 0.0622, "reward": 1.160416692495346, "reward_std": 0.08663471266627312, "rewards/accuracy_reward": 0.1791666718199849, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9812500059604645, "step": 1795 }, { "clip_ratio": 0.0, "completion_length": 573.2187774658203, "epoch": 0.5748119699151865, "grad_norm": 0.09085609018802643, "kl": 0.24539805799722672, "learning_rate": 9.134937321969941e-06, "loss": 0.0423, "reward": 1.107291692495346, "reward_std": 0.10802287980914116, "rewards/accuracy_reward": 0.12083333693444728, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9864583373069763, "step": 1796 }, { "clip_ratio": 0.0, "completion_length": 540.8791809082031, "epoch": 0.5751320211233797, "grad_norm": 0.05450833588838577, "kl": 0.15207632929086684, "learning_rate": 9.123803692409609e-06, "loss": 0.0306, "reward": 1.0890625238418579, "reward_std": 0.07662020195275546, "rewards/accuracy_reward": 0.10000000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9890625238418579, "step": 1797 }, { "clip_ratio": 0.0, "completion_length": 519.5541778564453, "epoch": 0.575452072331573, "grad_norm": 0.12327313423156738, "kl": 0.36361787244677546, "learning_rate": 9.112671157257698e-06, "loss": 0.0845, "reward": 1.0776041984558105, "reward_std": 0.14283109903335572, "rewards/accuracy_reward": 0.09375000391155482, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9838541746139526, "step": 1798 }, { "clip_ratio": 0.0, "completion_length": 556.6666809082031, "epoch": 0.5757721235397664, "grad_norm": 0.3154713809490204, "kl": 0.17768015563488007, "learning_rate": 9.101539730419247e-06, "loss": 0.0444, "reward": 1.0208333611488343, "reward_std": 0.11578117497265339, "rewards/accuracy_reward": 0.03333333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.987500011920929, "step": 1799 }, { "clip_ratio": 0.0, "completion_length": 551.6021026611328, "epoch": 0.5760921747479597, "grad_norm": 0.08551464974880219, "kl": 0.14677060693502425, "learning_rate": 9.090409425797908e-06, "loss": 0.0544, "reward": 1.0614583492279053, "reward_std": 0.08336172327399254, "rewards/accuracy_reward": 0.07291667070239782, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9885416746139526, "step": 1800 }, { "clip_ratio": 0.0, "completion_length": 579.9854370117188, "epoch": 0.5764122259561529, "grad_norm": 0.0542651005089283, "kl": 0.19063802286982537, "learning_rate": 9.07928025729593e-06, "loss": 0.0169, "reward": 1.0145833432674407, "reward_std": 0.05645497292280197, "rewards/accuracy_reward": 0.022916667722165586, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9916666746139526, "step": 1801 }, { "clip_ratio": 0.0, "completion_length": 616.9625061035156, "epoch": 0.5767322771643463, "grad_norm": 0.0507982037961483, "kl": 0.19528093002736568, "learning_rate": 9.068152238814139e-06, "loss": 0.0329, "reward": 1.0057291746139527, "reward_std": 0.06221659388393164, "rewards/accuracy_reward": 0.01458333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9911458432674408, "step": 1802 }, { "clip_ratio": 0.0, "completion_length": 578.5396057128906, "epoch": 0.5770523283725396, "grad_norm": 0.10525275021791458, "kl": 0.17930835708975792, "learning_rate": 9.057025384251934e-06, "loss": 0.0635, "reward": 1.0838541865348816, "reward_std": 0.09392364919185639, "rewards/accuracy_reward": 0.09791667070239782, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 1803 }, { "clip_ratio": 0.0, "completion_length": 548.9771026611328, "epoch": 0.577372379580733, "grad_norm": 0.10307978838682175, "kl": 0.30039278194308283, "learning_rate": 9.045899707507247e-06, "loss": 0.0845, "reward": 1.100000023841858, "reward_std": 0.1839461788535118, "rewards/accuracy_reward": 0.1312500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9687500238418579, "step": 1804 }, { "clip_ratio": 0.0, "completion_length": 586.1333557128906, "epoch": 0.5776924307889262, "grad_norm": 0.15342208743095398, "kl": 0.27368993014097215, "learning_rate": 9.034775222476555e-06, "loss": 0.0676, "reward": 1.0489583611488342, "reward_std": 0.12007906846702099, "rewards/accuracy_reward": 0.0708333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.978125023841858, "step": 1805 }, { "clip_ratio": 0.0, "completion_length": 552.8333557128906, "epoch": 0.5780124819971195, "grad_norm": 0.08706728368997574, "kl": 0.26777110919356345, "learning_rate": 9.023651943054825e-06, "loss": 0.0508, "reward": 1.0380208611488342, "reward_std": 0.11098587065935135, "rewards/accuracy_reward": 0.05000000055879354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9880208373069763, "step": 1806 }, { "clip_ratio": 0.0, "completion_length": 598.747933959961, "epoch": 0.5783325332053129, "grad_norm": 0.1062178835272789, "kl": 0.3234595455229282, "learning_rate": 9.012529883135548e-06, "loss": 0.0528, "reward": 1.0947916984558106, "reward_std": 0.13509288653731347, "rewards/accuracy_reward": 0.11875000223517418, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416746139526, "step": 1807 }, { "clip_ratio": 0.0, "completion_length": 610.345849609375, "epoch": 0.5786525844135062, "grad_norm": 0.12420543283224106, "kl": 0.27922215312719345, "learning_rate": 9.001409056610662e-06, "loss": 0.0838, "reward": 0.9921875178813935, "reward_std": 0.09044673759490252, "rewards/accuracy_reward": 0.012500000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979687511920929, "step": 1808 }, { "clip_ratio": 0.0, "completion_length": 568.7291809082031, "epoch": 0.5789726356216994, "grad_norm": 0.10455825179815292, "kl": 0.2904270239174366, "learning_rate": 8.990289477370587e-06, "loss": 0.0701, "reward": 1.1682292103767395, "reward_std": 0.15833674147725105, "rewards/accuracy_reward": 0.1812500037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9869791805744171, "step": 1809 }, { "clip_ratio": 0.0, "completion_length": 613.3041748046875, "epoch": 0.5792926868298928, "grad_norm": 0.12697599828243256, "kl": 0.43104536086320877, "learning_rate": 8.979171159304166e-06, "loss": 0.0888, "reward": 1.0036458492279052, "reward_std": 0.11862440332770348, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291865348815, "step": 1810 }, { "clip_ratio": 0.0, "completion_length": 551.5312622070312, "epoch": 0.5796127380380861, "grad_norm": 0.2758425176143646, "kl": 0.4129337251186371, "learning_rate": 8.968054116298683e-06, "loss": 0.0883, "reward": 1.025000011920929, "reward_std": 0.1669561004266143, "rewards/accuracy_reward": 0.05208333376795053, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9729166746139526, "step": 1811 }, { "clip_ratio": 0.0, "completion_length": 607.5604370117187, "epoch": 0.5799327892462794, "grad_norm": 0.07522869855165482, "kl": 0.23817719668149948, "learning_rate": 8.95693836223982e-06, "loss": 0.0411, "reward": 1.043750035762787, "reward_std": 0.13217582330107688, "rewards/accuracy_reward": 0.0604166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333492279053, "step": 1812 }, { "clip_ratio": 0.0, "completion_length": 563.7854370117187, "epoch": 0.5802528404544727, "grad_norm": 0.12487686425447464, "kl": 0.3361851140856743, "learning_rate": 8.94582391101165e-06, "loss": 0.0758, "reward": 1.1651041984558106, "reward_std": 0.1337714796885848, "rewards/accuracy_reward": 0.18333333637565374, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708373069763, "step": 1813 }, { "clip_ratio": 0.0, "completion_length": 609.3562622070312, "epoch": 0.580572891662666, "grad_norm": 0.08441471308469772, "kl": 0.3453087739646435, "learning_rate": 8.934710776496623e-06, "loss": 0.0425, "reward": 1.0348958611488341, "reward_std": 0.1058173468336463, "rewards/accuracy_reward": 0.05625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458551883698, "step": 1814 }, { "clip_ratio": 0.0, "completion_length": 614.9521057128907, "epoch": 0.5808929428708594, "grad_norm": 0.27958735823631287, "kl": 0.3278763361275196, "learning_rate": 8.923598972575537e-06, "loss": 0.0478, "reward": 1.1067708611488343, "reward_std": 0.1621831137686968, "rewards/accuracy_reward": 0.11875000558793544, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9880208492279052, "step": 1815 }, { "clip_ratio": 0.0, "completion_length": 616.8083618164062, "epoch": 0.5812129940790527, "grad_norm": 0.1701308786869049, "kl": 0.27441012263298037, "learning_rate": 8.912488513127539e-06, "loss": 0.0623, "reward": 1.0520833611488343, "reward_std": 0.13372773118317127, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9791666805744171, "step": 1816 }, { "clip_ratio": 0.0, "completion_length": 574.4354370117187, "epoch": 0.5815330452872459, "grad_norm": 0.11255865544080734, "kl": 0.484719355404377, "learning_rate": 8.901379412030089e-06, "loss": 0.0888, "reward": 1.1411458551883698, "reward_std": 0.19930556789040565, "rewards/accuracy_reward": 0.1645833369344473, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9744791865348816, "step": 1817 }, { "clip_ratio": 0.0, "completion_length": 546.7291900634766, "epoch": 0.5818530964954393, "grad_norm": 0.1106775552034378, "kl": 0.16104906797409058, "learning_rate": 8.89027168315895e-06, "loss": 0.0416, "reward": 1.1359375298023224, "reward_std": 0.0840451443567872, "rewards/accuracy_reward": 0.14791667014360427, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9880208492279052, "step": 1818 }, { "clip_ratio": 0.0, "completion_length": 570.6437622070313, "epoch": 0.5821731477036326, "grad_norm": 0.11812810599803925, "kl": 0.2805390991270542, "learning_rate": 8.879165340388171e-06, "loss": 0.059, "reward": 1.0562500178813934, "reward_std": 0.14737335927784442, "rewards/accuracy_reward": 0.08125000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.975000011920929, "step": 1819 }, { "clip_ratio": 0.0, "completion_length": 558.6979309082031, "epoch": 0.5824931989118259, "grad_norm": 0.10681235790252686, "kl": 0.1654998004436493, "learning_rate": 8.868060397590075e-06, "loss": 0.0493, "reward": 1.1421875357627869, "reward_std": 0.10358922835439444, "rewards/accuracy_reward": 0.1520833395421505, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9880208492279052, "step": 1820 }, { "clip_ratio": 0.0, "completion_length": 576.9562622070313, "epoch": 0.5828132501200192, "grad_norm": 0.11712974309921265, "kl": 0.430647674202919, "learning_rate": 8.856956868635233e-06, "loss": 0.075, "reward": 1.1479166984558105, "reward_std": 0.1634374063462019, "rewards/accuracy_reward": 0.17708333842456342, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333551883698, "step": 1821 }, { "clip_ratio": 0.0, "completion_length": 556.0479339599609, "epoch": 0.5831333013282125, "grad_norm": 0.16343854367733002, "kl": 0.4074979230761528, "learning_rate": 8.845854767392448e-06, "loss": 0.079, "reward": 1.1473958849906922, "reward_std": 0.1910140451043844, "rewards/accuracy_reward": 0.1708333410322666, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625178813935, "step": 1822 }, { "clip_ratio": 0.0, "completion_length": 625.0375244140625, "epoch": 0.5834533525364058, "grad_norm": 0.16372530162334442, "kl": 0.3780060760676861, "learning_rate": 8.834754107728738e-06, "loss": 0.032, "reward": 1.0942708730697632, "reward_std": 0.1673297893255949, "rewards/accuracy_reward": 0.11250000204890967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708492279053, "step": 1823 }, { "clip_ratio": 0.0, "completion_length": 594.7833526611328, "epoch": 0.5837734037445992, "grad_norm": 0.13239432871341705, "kl": 0.326027612388134, "learning_rate": 8.82365490350933e-06, "loss": 0.0631, "reward": 1.1911458730697633, "reward_std": 0.16341153010725976, "rewards/accuracy_reward": 0.2125000050291419, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458432674408, "step": 1824 }, { "clip_ratio": 0.0, "completion_length": 596.020849609375, "epoch": 0.5840934549527924, "grad_norm": 0.32353493571281433, "kl": 0.23483212813735008, "learning_rate": 8.812557168597626e-06, "loss": 0.0763, "reward": 1.0645833611488342, "reward_std": 0.1218577940016985, "rewards/accuracy_reward": 0.0875000024214387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833551883698, "step": 1825 }, { "clip_ratio": 0.0, "completion_length": 600.0958557128906, "epoch": 0.5844135061609858, "grad_norm": 0.14850178360939026, "kl": 0.2920558929443359, "learning_rate": 8.801460916855194e-06, "loss": 0.0739, "reward": 1.0692708611488342, "reward_std": 0.1560745693743229, "rewards/accuracy_reward": 0.08750000372529029, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9796875178813934, "step": 1826 }, { "clip_ratio": 0.0, "completion_length": 609.6604370117187, "epoch": 0.5847335573691791, "grad_norm": 0.15987901389598846, "kl": 0.47453284710645677, "learning_rate": 8.790366162141747e-06, "loss": 0.0701, "reward": 1.1197916984558105, "reward_std": 0.1661427855491638, "rewards/accuracy_reward": 0.1458333371207118, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9697916865348816, "step": 1827 }, { "clip_ratio": 0.0, "completion_length": 612.220849609375, "epoch": 0.5850536085773724, "grad_norm": 0.17211389541625977, "kl": 0.44642241299152374, "learning_rate": 8.779272918315135e-06, "loss": 0.0873, "reward": 1.0687500178813933, "reward_std": 0.18413979113101958, "rewards/accuracy_reward": 0.10625000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9625000059604645, "step": 1828 }, { "clip_ratio": 0.0, "completion_length": 599.7250183105468, "epoch": 0.5853736597855657, "grad_norm": 0.1202344223856926, "kl": 0.4751662090420723, "learning_rate": 8.768181199231309e-06, "loss": 0.0769, "reward": 1.0192708611488341, "reward_std": 0.12260491922497749, "rewards/accuracy_reward": 0.045833334885537626, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9713541805744171, "step": 1829 }, { "clip_ratio": 0.0, "completion_length": 602.227099609375, "epoch": 0.585693710993759, "grad_norm": 0.26715612411499023, "kl": 0.5222580231726169, "learning_rate": 8.757091018744327e-06, "loss": 0.0813, "reward": 1.1015625476837159, "reward_std": 0.15981225967407225, "rewards/accuracy_reward": 0.1250000052154064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625119209289, "step": 1830 }, { "clip_ratio": 0.0, "completion_length": 627.545849609375, "epoch": 0.5860137622019523, "grad_norm": 0.1669369488954544, "kl": 0.35020282939076425, "learning_rate": 8.746002390706318e-06, "loss": 0.0863, "reward": 1.110416704416275, "reward_std": 0.11681613381952047, "rewards/accuracy_reward": 0.1354166718199849, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000178813935, "step": 1831 }, { "clip_ratio": 0.0, "completion_length": 623.5604370117187, "epoch": 0.5863338134101457, "grad_norm": 0.15815506875514984, "kl": 0.5524609833955765, "learning_rate": 8.734915328967484e-06, "loss": 0.0783, "reward": 1.0312500178813935, "reward_std": 0.1657247856259346, "rewards/accuracy_reward": 0.07291666995733977, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9583333373069763, "step": 1832 }, { "clip_ratio": 0.0, "completion_length": 608.268765258789, "epoch": 0.5866538646183389, "grad_norm": 0.09822983294725418, "kl": 0.27492492496967313, "learning_rate": 8.723829847376054e-06, "loss": 0.0452, "reward": 1.042187511920929, "reward_std": 0.12717793975025415, "rewards/accuracy_reward": 0.06250000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979687511920929, "step": 1833 }, { "clip_ratio": 0.0, "completion_length": 598.5625183105469, "epoch": 0.5869739158265322, "grad_norm": 0.08277434855699539, "kl": 0.2869084417819977, "learning_rate": 8.712745959778293e-06, "loss": 0.0608, "reward": 1.0411458611488342, "reward_std": 0.1428369514644146, "rewards/accuracy_reward": 0.0666666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791865348816, "step": 1834 }, { "clip_ratio": 0.0, "completion_length": 589.0021057128906, "epoch": 0.5872939670347256, "grad_norm": 0.081912100315094, "kl": 0.14179150089621545, "learning_rate": 8.70166368001847e-06, "loss": 0.0047, "reward": 1.176041692495346, "reward_std": 0.09121424276381732, "rewards/accuracy_reward": 0.1854166727513075, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.990625011920929, "step": 1835 }, { "clip_ratio": 0.0, "completion_length": 602.1062683105469, "epoch": 0.5876140182429188, "grad_norm": 0.10189507901668549, "kl": 0.19410741589963437, "learning_rate": 8.690583021938854e-06, "loss": 0.0435, "reward": 1.107291692495346, "reward_std": 0.1382425595074892, "rewards/accuracy_reward": 0.11666667014360428, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9906250059604644, "step": 1836 }, { "clip_ratio": 0.0, "completion_length": 605.6708587646484, "epoch": 0.5879340694511122, "grad_norm": 0.08415410667657852, "kl": 0.15824654512107372, "learning_rate": 8.679503999379679e-06, "loss": 0.0542, "reward": 1.0067708432674407, "reward_std": 0.12068486250936986, "rewards/accuracy_reward": 0.02708333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.979687511920929, "step": 1837 }, { "clip_ratio": 0.0, "completion_length": 581.6166870117188, "epoch": 0.5882541206593055, "grad_norm": 0.17711393535137177, "kl": 0.18984725177288056, "learning_rate": 8.66842662617914e-06, "loss": 0.0495, "reward": 1.0437500298023223, "reward_std": 0.116551817022264, "rewards/accuracy_reward": 0.06041666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9833333492279053, "step": 1838 }, { "clip_ratio": 0.0, "completion_length": 606.8062744140625, "epoch": 0.5885741718674988, "grad_norm": 0.30450427532196045, "kl": 0.3485433362424374, "learning_rate": 8.657350916173376e-06, "loss": 0.0873, "reward": 1.0572916924953462, "reward_std": 0.1278322547674179, "rewards/accuracy_reward": 0.08333333544433116, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583492279053, "step": 1839 }, { "clip_ratio": 0.0, "completion_length": 583.1375122070312, "epoch": 0.5888942230756921, "grad_norm": 0.06369006633758545, "kl": 0.19454945325851442, "learning_rate": 8.646276883196438e-06, "loss": 0.0488, "reward": 1.1505208551883697, "reward_std": 0.0879446528851986, "rewards/accuracy_reward": 0.16458333935588598, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375059604645, "step": 1840 }, { "clip_ratio": 0.0, "completion_length": 601.237515258789, "epoch": 0.5892142742838854, "grad_norm": 0.1309588998556137, "kl": 0.36753388717770574, "learning_rate": 8.635204541080297e-06, "loss": 0.0331, "reward": 1.1442708671092987, "reward_std": 0.12082125805318356, "rewards/accuracy_reward": 0.16875000409781932, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9755208432674408, "step": 1841 }, { "clip_ratio": 0.0, "completion_length": 611.8687713623046, "epoch": 0.5895343254920787, "grad_norm": 0.06236157566308975, "kl": 0.2872423198074102, "learning_rate": 8.624133903654802e-06, "loss": 0.0428, "reward": 1.043750035762787, "reward_std": 0.14997683018445968, "rewards/accuracy_reward": 0.06250000093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.981250011920929, "step": 1842 }, { "clip_ratio": 0.0, "completion_length": 570.7250183105468, "epoch": 0.5898543767002721, "grad_norm": 0.07617057114839554, "kl": 0.1260071013122797, "learning_rate": 8.613064984747672e-06, "loss": 0.0206, "reward": 1.0890625238418579, "reward_std": 0.1047076016664505, "rewards/accuracy_reward": 0.09791666902601719, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9911458492279053, "step": 1843 }, { "clip_ratio": 0.0, "completion_length": 630.8041809082031, "epoch": 0.5901744279084653, "grad_norm": 0.054941531270742416, "kl": 0.1285891652107239, "learning_rate": 8.601997798184486e-06, "loss": 0.0382, "reward": 1.1401041984558105, "reward_std": 0.10463837906718254, "rewards/accuracy_reward": 0.1500000035390258, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9901041686534882, "step": 1844 }, { "clip_ratio": 0.0, "completion_length": 569.7166809082031, "epoch": 0.5904944791166586, "grad_norm": 0.05618637055158615, "kl": 0.19999119341373445, "learning_rate": 8.590932357788652e-06, "loss": 0.0212, "reward": 1.1223958551883697, "reward_std": 0.0716858796775341, "rewards/accuracy_reward": 0.13333333656191826, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9890625059604645, "step": 1845 }, { "clip_ratio": 0.0, "completion_length": 584.1625244140625, "epoch": 0.590814530324852, "grad_norm": 0.1865171641111374, "kl": 0.21524617075920105, "learning_rate": 8.5798686773814e-06, "loss": 0.0601, "reward": 1.1328125238418578, "reward_std": 0.11431420799344778, "rewards/accuracy_reward": 0.15416667088866234, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458492279053, "step": 1846 }, { "clip_ratio": 0.0, "completion_length": 619.3208557128906, "epoch": 0.5911345815330453, "grad_norm": 0.10544977337121964, "kl": 0.16690328232944013, "learning_rate": 8.568806770781769e-06, "loss": 0.0462, "reward": 1.0468750178813935, "reward_std": 0.15823222547769547, "rewards/accuracy_reward": 0.06458333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9822916924953461, "step": 1847 }, { "clip_ratio": 0.0, "completion_length": 603.977099609375, "epoch": 0.5914546327412386, "grad_norm": 0.13692694902420044, "kl": 0.3568322047591209, "learning_rate": 8.557746651806566e-06, "loss": 0.0936, "reward": 1.0713541984558106, "reward_std": 0.17686703354120253, "rewards/accuracy_reward": 0.10000000260770321, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9713541805744171, "step": 1848 }, { "clip_ratio": 0.0, "completion_length": 592.2041931152344, "epoch": 0.5917746839494319, "grad_norm": 0.07218168675899506, "kl": 0.15120973512530328, "learning_rate": 8.546688334270381e-06, "loss": 0.0523, "reward": 1.108333373069763, "reward_std": 0.10372773855924607, "rewards/accuracy_reward": 0.12083334028720856, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9875000238418579, "step": 1849 }, { "clip_ratio": 0.0, "completion_length": 608.3396118164062, "epoch": 0.5920947351576252, "grad_norm": 0.12064994126558304, "kl": 0.28165081068873404, "learning_rate": 8.53563183198555e-06, "loss": 0.0522, "reward": 1.0250000298023223, "reward_std": 0.12865074295550585, "rewards/accuracy_reward": 0.04791666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9770833432674408, "step": 1850 }, { "clip_ratio": 0.0, "completion_length": 590.3687683105469, "epoch": 0.5924147863658186, "grad_norm": 0.051113247871398926, "kl": 0.16676584184169768, "learning_rate": 8.524577158762137e-06, "loss": 0.0378, "reward": 1.0406250298023223, "reward_std": 0.0711493318900466, "rewards/accuracy_reward": 0.05208333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9885416805744172, "step": 1851 }, { "clip_ratio": 0.0, "completion_length": 599.1875183105469, "epoch": 0.5927348375740118, "grad_norm": 0.07360915094614029, "kl": 0.13382341675460338, "learning_rate": 8.51352432840792e-06, "loss": 0.0258, "reward": 1.0822916984558106, "reward_std": 0.1298227585852146, "rewards/accuracy_reward": 0.08750000279396772, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9927083432674408, "step": 1852 }, { "clip_ratio": 0.0, "completion_length": 598.8396057128906, "epoch": 0.5930548887822051, "grad_norm": 0.08503315597772598, "kl": 0.24217786304652691, "learning_rate": 8.502473354728384e-06, "loss": 0.0298, "reward": 1.1484375298023224, "reward_std": 0.13012812230736018, "rewards/accuracy_reward": 0.16250000353902577, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9859375119209289, "step": 1853 }, { "clip_ratio": 0.0, "completion_length": 602.2250213623047, "epoch": 0.5933749399903985, "grad_norm": 0.1039816215634346, "kl": 0.13602565452456475, "learning_rate": 8.491424251526688e-06, "loss": 0.0292, "reward": 1.0276041865348815, "reward_std": 0.04552529603242874, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9901041746139526, "step": 1854 }, { "clip_ratio": 0.0, "completion_length": 604.8479431152343, "epoch": 0.5936949911985918, "grad_norm": 0.06977000087499619, "kl": 0.23898737505078316, "learning_rate": 8.480377032603658e-06, "loss": 0.0305, "reward": 1.0536458671092988, "reward_std": 0.08575146589428187, "rewards/accuracy_reward": 0.06666666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9869791865348816, "step": 1855 }, { "clip_ratio": 0.0, "completion_length": 584.564599609375, "epoch": 0.594015042406785, "grad_norm": 0.0967579260468483, "kl": 0.29310873821377753, "learning_rate": 8.46933171175776e-06, "loss": 0.0658, "reward": 1.160937523841858, "reward_std": 0.15478852652013303, "rewards/accuracy_reward": 0.17291667368263006, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9838541746139526, "step": 1856 }, { "clip_ratio": 0.0, "completion_length": 591.5770935058594, "epoch": 0.5943350936149784, "grad_norm": 0.12447807192802429, "kl": 0.2548402227461338, "learning_rate": 8.4582883027851e-06, "loss": 0.0578, "reward": 1.0817708611488341, "reward_std": 0.15226125419139863, "rewards/accuracy_reward": 0.10416666846722364, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9755208432674408, "step": 1857 }, { "clip_ratio": 0.0, "completion_length": 644.5750183105469, "epoch": 0.5946551448231717, "grad_norm": 0.1542220264673233, "kl": 0.2618264824151993, "learning_rate": 8.44724681947939e-06, "loss": 0.0396, "reward": 1.0192708492279052, "reward_std": 0.0826049368828535, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9817708492279053, "step": 1858 }, { "clip_ratio": 0.0, "completion_length": 594.1791931152344, "epoch": 0.5949751960313651, "grad_norm": 0.23071099817752838, "kl": 0.41815656051039696, "learning_rate": 8.436207275631937e-06, "loss": 0.0822, "reward": 1.003125011920929, "reward_std": 0.13680147156119346, "rewards/accuracy_reward": 0.027083333395421506, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416805744171, "step": 1859 }, { "clip_ratio": 0.0, "completion_length": 632.9541870117188, "epoch": 0.5952952472395583, "grad_norm": 0.22107675671577454, "kl": 0.32695833742618563, "learning_rate": 8.425169685031623e-06, "loss": 0.0619, "reward": 1.0614583611488342, "reward_std": 0.12557398490607738, "rewards/accuracy_reward": 0.08750000204890966, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583551883697, "step": 1860 }, { "clip_ratio": 0.0, "completion_length": 611.077099609375, "epoch": 0.5956152984477516, "grad_norm": 0.08938566595315933, "kl": 0.1705992490053177, "learning_rate": 8.414134061464898e-06, "loss": 0.0631, "reward": 1.027604204416275, "reward_std": 0.14426877107471228, "rewards/accuracy_reward": 0.050000001676380634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 1861 }, { "clip_ratio": 0.0, "completion_length": 636.795849609375, "epoch": 0.595935349655945, "grad_norm": 0.3400190472602844, "kl": 0.5977102071046829, "learning_rate": 8.403100418715743e-06, "loss": 0.0797, "reward": 1.0213542044162751, "reward_std": 0.13304599486291407, "rewards/accuracy_reward": 0.05833333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9630208611488342, "step": 1862 }, { "clip_ratio": 0.0, "completion_length": 613.295849609375, "epoch": 0.5962554008641383, "grad_norm": 0.13622289896011353, "kl": 0.3738606728613377, "learning_rate": 8.392068770565675e-06, "loss": 0.0608, "reward": 1.0859375238418578, "reward_std": 0.11104068085551262, "rewards/accuracy_reward": 0.10833333637565375, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9755208492279053, "step": 1863 }, { "clip_ratio": 0.0, "completion_length": 597.4291931152344, "epoch": 0.5965754520723315, "grad_norm": 0.16210956871509552, "kl": 0.32467666193842887, "learning_rate": 8.381039130793718e-06, "loss": 0.1061, "reward": 1.051562523841858, "reward_std": 0.1963793769478798, "rewards/accuracy_reward": 0.08333333451300859, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291865348815, "step": 1864 }, { "clip_ratio": 0.0, "completion_length": 609.9354309082031, "epoch": 0.5968955032805249, "grad_norm": 0.12312551587820053, "kl": 0.2595462821424007, "learning_rate": 8.370011513176381e-06, "loss": 0.0772, "reward": 1.0911458730697632, "reward_std": 0.10535571686923503, "rewards/accuracy_reward": 0.11250000428408384, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9786458373069763, "step": 1865 }, { "clip_ratio": 0.0, "completion_length": 581.1708618164063, "epoch": 0.5972155544887182, "grad_norm": 0.3161362111568451, "kl": 0.3036894164979458, "learning_rate": 8.35898593148766e-06, "loss": 0.0434, "reward": 1.036979192495346, "reward_std": 0.11786015536636114, "rewards/accuracy_reward": 0.06041666828095913, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9765625238418579, "step": 1866 }, { "clip_ratio": 0.0, "completion_length": 589.6125244140625, "epoch": 0.5975356056969116, "grad_norm": 0.14329548180103302, "kl": 0.3524964414536953, "learning_rate": 8.347962399498996e-06, "loss": 0.1037, "reward": 1.0718750357627869, "reward_std": 0.159610353410244, "rewards/accuracy_reward": 0.09791666958481074, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9739583551883697, "step": 1867 }, { "clip_ratio": 0.0, "completion_length": 596.6395935058594, "epoch": 0.5978556569051048, "grad_norm": 0.16905654966831207, "kl": 0.3246914021670818, "learning_rate": 8.336940930979275e-06, "loss": 0.0681, "reward": 1.0343750238418579, "reward_std": 0.17898188009858132, "rewards/accuracy_reward": 0.05833333563059569, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9760416746139526, "step": 1868 }, { "clip_ratio": 0.0, "completion_length": 609.1625122070312, "epoch": 0.5981757081132981, "grad_norm": 0.2423020452260971, "kl": 0.33233404383063314, "learning_rate": 8.325921539694805e-06, "loss": 0.0671, "reward": 1.193229216337204, "reward_std": 0.2123827485367656, "rewards/accuracy_reward": 0.22083333879709244, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9703125298023224, "step": 1869 }, { "clip_ratio": 0.0, "completion_length": 575.6083526611328, "epoch": 0.5984957593214915, "grad_norm": 0.13021445274353027, "kl": 0.2521624334156513, "learning_rate": 8.314904239409295e-06, "loss": 0.0551, "reward": 1.0932291924953461, "reward_std": 0.11757631208747625, "rewards/accuracy_reward": 0.11458333693444729, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9744791865348816, "step": 1870 }, { "clip_ratio": 0.0, "completion_length": 594.9771087646484, "epoch": 0.5988158105296848, "grad_norm": 0.1187562495470047, "kl": 0.3661712847650051, "learning_rate": 8.303889043883852e-06, "loss": 0.0749, "reward": 1.1473958611488342, "reward_std": 0.17185933999717234, "rewards/accuracy_reward": 0.1729166703298688, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9744791805744171, "step": 1871 }, { "clip_ratio": 0.0, "completion_length": 588.9666870117187, "epoch": 0.599135861737878, "grad_norm": 0.1332739293575287, "kl": 0.34436929896473883, "learning_rate": 8.292875966876947e-06, "loss": 0.0807, "reward": 1.1187500417232514, "reward_std": 0.14176899399608373, "rewards/accuracy_reward": 0.147916672937572, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9708333432674408, "step": 1872 }, { "clip_ratio": 0.0, "completion_length": 602.145849609375, "epoch": 0.5994559129460714, "grad_norm": 0.21813121438026428, "kl": 0.27012273371219636, "learning_rate": 8.281865022144403e-06, "loss": 0.0445, "reward": 1.0078125178813935, "reward_std": 0.13042169529944658, "rewards/accuracy_reward": 0.025000000931322576, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9786458551883698, "step": 1873 }, { "clip_ratio": 0.0, "completion_length": 609.7396057128906, "epoch": 0.5997759641542647, "grad_norm": 0.21716895699501038, "kl": 0.29259502738714216, "learning_rate": 8.270856223439386e-06, "loss": 0.0442, "reward": 1.0255208671092988, "reward_std": 0.14278821237385272, "rewards/accuracy_reward": 0.04791666902601719, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9776041805744171, "step": 1874 }, { "clip_ratio": 0.0, "completion_length": 581.4291870117188, "epoch": 0.600096015362458, "grad_norm": 0.12663888931274414, "kl": 0.33957659900188447, "learning_rate": 8.25984958451238e-06, "loss": 0.0852, "reward": 1.0302083671092988, "reward_std": 0.13524105921387672, "rewards/accuracy_reward": 0.05833333525806665, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9697916805744171, "step": 1875 }, { "clip_ratio": 0.0, "completion_length": 597.2229370117187, "epoch": 0.6004160665706513, "grad_norm": 0.1679977923631668, "kl": 0.4340445719659328, "learning_rate": 8.248845119111168e-06, "loss": 0.096, "reward": 1.0390625298023224, "reward_std": 0.15016994029283523, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9682291805744171, "step": 1876 }, { "clip_ratio": 0.0, "completion_length": 598.52294921875, "epoch": 0.6007361177788446, "grad_norm": 0.3022070527076721, "kl": 0.3699290931224823, "learning_rate": 8.23784284098082e-06, "loss": 0.11, "reward": 1.0911458492279054, "reward_std": 0.18554365485906602, "rewards/accuracy_reward": 0.12083333898335695, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9682291805744171, "step": 1877 }, { "clip_ratio": 0.0, "completion_length": 604.1500244140625, "epoch": 0.601056168987038, "grad_norm": 0.09283298999071121, "kl": 0.35596207827329635, "learning_rate": 8.226842763863675e-06, "loss": 0.0632, "reward": 1.0843750298023225, "reward_std": 0.11838657595217228, "rewards/accuracy_reward": 0.10208333730697632, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9781250119209289, "step": 1878 }, { "clip_ratio": 0.0, "completion_length": 616.0000183105469, "epoch": 0.6013762201952313, "grad_norm": 0.15658091008663177, "kl": 0.21277981698513032, "learning_rate": 8.21584490149932e-06, "loss": 0.0449, "reward": 1.080208384990692, "reward_std": 0.1628492258489132, "rewards/accuracy_reward": 0.09791666902601719, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9781250298023224, "step": 1879 }, { "clip_ratio": 0.0, "completion_length": 563.5937561035156, "epoch": 0.6016962714034245, "grad_norm": 0.21834583580493927, "kl": 0.41966616809368135, "learning_rate": 8.20484926762458e-06, "loss": 0.0764, "reward": 1.0187500178813935, "reward_std": 0.08752396404743194, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9791666865348816, "step": 1880 }, { "clip_ratio": 0.0, "completion_length": 560.5562683105469, "epoch": 0.6020163226116179, "grad_norm": 0.3363422453403473, "kl": 0.26540239825844764, "learning_rate": 8.19385587597349e-06, "loss": 0.0591, "reward": 1.058854204416275, "reward_std": 0.16936119571328162, "rewards/accuracy_reward": 0.0770833345130086, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9776041746139527, "step": 1881 }, { "clip_ratio": 0.0, "completion_length": 589.0604431152344, "epoch": 0.6023363738198112, "grad_norm": 0.21959060430526733, "kl": 0.3500664710998535, "learning_rate": 8.182864740277293e-06, "loss": 0.0697, "reward": 1.0619792103767396, "reward_std": 0.13994233533740044, "rewards/accuracy_reward": 0.08333333749324083, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.9723958432674408, "step": 1882 }, { "clip_ratio": 0.0, "completion_length": 574.7271057128906, "epoch": 0.6026564250280044, "grad_norm": 0.3588142395019531, "kl": 0.6075701117515564, "learning_rate": 8.171875874264408e-06, "loss": 0.1016, "reward": 1.0489583611488342, "reward_std": 0.1500548876821995, "rewards/accuracy_reward": 0.07500000279396772, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9718750178813934, "step": 1883 }, { "clip_ratio": 0.0, "completion_length": 588.9437744140625, "epoch": 0.6029764762361978, "grad_norm": 0.16027261316776276, "kl": 0.2999001145362854, "learning_rate": 8.160889291660423e-06, "loss": 0.0407, "reward": 1.152604204416275, "reward_std": 0.1476465906947851, "rewards/accuracy_reward": 0.15833333730697632, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.9880208432674408, "step": 1884 }, { "clip_ratio": 0.0, "completion_length": 609.5125213623047, "epoch": 0.6032965274443911, "grad_norm": 0.13303539156913757, "kl": 0.3691695436835289, "learning_rate": 8.149905006188067e-06, "loss": 0.0584, "reward": 1.0328125178813934, "reward_std": 0.10183481201529503, "rewards/accuracy_reward": 0.05208333544433117, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9786458432674408, "step": 1885 }, { "clip_ratio": 0.0, "completion_length": 558.1396026611328, "epoch": 0.6036165786525844, "grad_norm": 0.1396128535270691, "kl": 0.39802908822894095, "learning_rate": 8.13892303156721e-06, "loss": 0.0949, "reward": 1.1458333671092986, "reward_std": 0.14314947053790092, "rewards/accuracy_reward": 0.1687500063329935, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9750000059604644, "step": 1886 }, { "clip_ratio": 0.0, "completion_length": 598.9104370117187, "epoch": 0.6039366298607777, "grad_norm": 0.20720265805721283, "kl": 0.47100530862808226, "learning_rate": 8.127943381514822e-06, "loss": 0.1295, "reward": 1.0052083432674408, "reward_std": 0.16166542023420333, "rewards/accuracy_reward": 0.0479166679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9572916746139526, "step": 1887 }, { "clip_ratio": 0.0, "completion_length": 572.8812744140625, "epoch": 0.604256681068971, "grad_norm": 0.17445041239261627, "kl": 0.3852656245231628, "learning_rate": 8.116966069744987e-06, "loss": 0.0527, "reward": 1.1286458730697633, "reward_std": 0.1472001016139984, "rewards/accuracy_reward": 0.14583333861082792, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.9765625119209289, "step": 1888 }, { "clip_ratio": 0.0, "completion_length": 579.8521057128906, "epoch": 0.6045767322771644, "grad_norm": 0.11102991551160812, "kl": 0.26756716668605807, "learning_rate": 8.105991109968846e-06, "loss": 0.0496, "reward": 1.028125023841858, "reward_std": 0.12138459905982017, "rewards/accuracy_reward": 0.03750000167638064, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.9843750119209289, "step": 1889 }, { "clip_ratio": 0.0, "completion_length": 579.4062683105469, "epoch": 0.6048967834853577, "grad_norm": 0.10683989524841309, "kl": 0.3563895784318447, "learning_rate": 8.095018515894633e-06, "loss": 0.0677, "reward": 1.0942708671092987, "reward_std": 0.16333364136517048, "rewards/accuracy_reward": 0.11250000279396773, "rewards/format_reward": 0.01250000037252903, "rewards/tag_count_reward": 0.9692708492279053, "step": 1890 }, { "clip_ratio": 0.0, "completion_length": 580.8750183105469, "epoch": 0.6052168346935509, "grad_norm": 0.10681261867284775, "kl": 0.2178695060312748, "learning_rate": 8.084048301227597e-06, "loss": 0.0632, "reward": 1.0796875357627869, "reward_std": 0.13646226227283478, "rewards/accuracy_reward": 0.09166667144745588, "rewards/format_reward": 0.00416666679084301, "rewards/tag_count_reward": 0.9838541746139526, "step": 1891 }, { "clip_ratio": 0.0, "completion_length": 571.268765258789, "epoch": 0.6055368859017443, "grad_norm": 0.1127118170261383, "kl": 0.1855003535747528, "learning_rate": 8.073080479670033e-06, "loss": 0.0417, "reward": 1.0666666805744172, "reward_std": 0.1344422660768032, "rewards/accuracy_reward": 0.08333333600312472, "rewards/format_reward": 0.00833333358168602, "rewards/tag_count_reward": 0.975000011920929, "step": 1892 }, { "clip_ratio": 0.0, "completion_length": 575.2666809082032, "epoch": 0.6058569371099376, "grad_norm": 0.21781377494335175, "kl": 0.4310750551521778, "learning_rate": 8.062115064921235e-06, "loss": 0.0905, "reward": 1.079687523841858, "reward_std": 0.14400339033454657, "rewards/accuracy_reward": 0.09166666977107525, "rewards/format_reward": 0.01458333358168602, "rewards/tag_count_reward": 0.9734375178813934, "step": 1893 }, { "clip_ratio": 0.0, "completion_length": 594.6437744140625, "epoch": 0.6061769883181309, "grad_norm": 0.2137860506772995, "kl": 0.3845756992697716, "learning_rate": 8.051152070677504e-06, "loss": 0.1213, "reward": 1.0130208671092986, "reward_std": 0.19951648712158204, "rewards/accuracy_reward": 0.04583333414047956, "rewards/format_reward": 0.010416666977107525, "rewards/tag_count_reward": 0.9567708611488343, "step": 1894 }, { "clip_ratio": 0.0, "completion_length": 579.7833435058594, "epoch": 0.6064970395263242, "grad_norm": 0.15622647106647491, "kl": 0.4282746434211731, "learning_rate": 8.040191510632105e-06, "loss": 0.1073, "reward": 1.0755208671092986, "reward_std": 0.20779597759246826, "rewards/accuracy_reward": 0.10416666902601719, "rewards/format_reward": 0.002083333395421505, "rewards/tag_count_reward": 0.9692708611488342, "step": 1895 }, { "clip_ratio": 0.0, "completion_length": 568.120849609375, "epoch": 0.6068170907345175, "grad_norm": 0.4617317020893097, "kl": 0.4920196183025837, "learning_rate": 8.02923339847527e-06, "loss": 0.0567, "reward": 1.1208333551883698, "reward_std": 0.16462844759225845, "rewards/accuracy_reward": 0.1479166716337204, "rewards/format_reward": 0.006250000186264515, "rewards/tag_count_reward": 0.9666666746139526, "step": 1896 }, { "clip_ratio": 0.0, "completion_length": 581.9041870117187, "epoch": 0.6071371419427108, "grad_norm": 0.34303173422813416, "kl": 0.2250536672770977, "learning_rate": 8.018277747894178e-06, "loss": 0.0734, "reward": 1.041666680574417, "reward_std": 0.16085805594921113, "rewards/accuracy_reward": 0.0666666679084301, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9750000059604644, "step": 1897 }, { "clip_ratio": 0.0, "completion_length": 606.177099609375, "epoch": 0.6074571931509042, "grad_norm": 0.23493434488773346, "kl": 0.6605220437049866, "learning_rate": 8.007324572572915e-06, "loss": 0.139, "reward": 1.0619792103767396, "reward_std": 0.23571573868393897, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.01666666716337204, "rewards/tag_count_reward": 0.9432291865348816, "step": 1898 }, { "clip_ratio": 0.0, "completion_length": 593.6937683105468, "epoch": 0.6077772443590974, "grad_norm": 0.2770790755748749, "kl": 0.7114527821540833, "learning_rate": 7.996373886192496e-06, "loss": 0.1715, "reward": 1.0718750476837158, "reward_std": 0.2413769096136093, "rewards/accuracy_reward": 0.10625000428408385, "rewards/format_reward": 0.03750000111758709, "rewards/tag_count_reward": 0.9281250178813935, "step": 1899 }, { "clip_ratio": 0.0, "completion_length": 562.602099609375, "epoch": 0.6080972955672908, "grad_norm": 0.7002983689308167, "kl": 0.8839252561330795, "learning_rate": 7.985425702430821e-06, "loss": 0.1461, "reward": 1.0312500238418578, "reward_std": 0.17215485386550428, "rewards/accuracy_reward": 0.05833333544433117, "rewards/format_reward": 0.016666667349636555, "rewards/tag_count_reward": 0.9562500178813934, "step": 1900 }, { "clip_ratio": 0.0, "completion_length": 568.7437713623046, "epoch": 0.6084173467754841, "grad_norm": 0.5294923782348633, "kl": 1.1238539427518845, "learning_rate": 7.974480034962655e-06, "loss": 0.1987, "reward": 1.046354204416275, "reward_std": 0.2591739296913147, "rewards/accuracy_reward": 0.08333333637565374, "rewards/format_reward": 0.043750001676380636, "rewards/tag_count_reward": 0.9192708551883697, "step": 1901 }, { "clip_ratio": 0.0, "completion_length": 585.6437744140625, "epoch": 0.6087373979836774, "grad_norm": 0.22682341933250427, "kl": 0.9062155365943909, "learning_rate": 7.96353689745963e-06, "loss": 0.1727, "reward": 1.0520833492279054, "reward_std": 0.2488498877733946, "rewards/accuracy_reward": 0.09375000130385161, "rewards/format_reward": 0.025000000931322576, "rewards/tag_count_reward": 0.9333333492279052, "step": 1902 }, { "clip_ratio": 0.0, "completion_length": 572.6833557128906, "epoch": 0.6090574491918707, "grad_norm": 0.234841451048851, "kl": 0.6701559334993362, "learning_rate": 7.952596303590215e-06, "loss": 0.1411, "reward": 1.0953125357627869, "reward_std": 0.23830147199332713, "rewards/accuracy_reward": 0.14375000428408385, "rewards/format_reward": 0.02291666716337204, "rewards/tag_count_reward": 0.9286458432674408, "step": 1903 }, { "clip_ratio": 0.0, "completion_length": 563.452099609375, "epoch": 0.609377500400064, "grad_norm": 0.21650651097297668, "kl": 0.7510048195719718, "learning_rate": 7.9416582670197e-06, "loss": 0.1216, "reward": 1.2067708730697633, "reward_std": 0.28640821799635885, "rewards/accuracy_reward": 0.18958333786576986, "rewards/format_reward": 0.08333333432674409, "rewards/tag_count_reward": 0.9338541865348816, "step": 1904 }, { "clip_ratio": 0.0, "completion_length": 558.0521118164063, "epoch": 0.6096975516082573, "grad_norm": 0.5106697678565979, "kl": 0.5842062175273895, "learning_rate": 7.930722801410184e-06, "loss": 0.1217, "reward": 0.9880208492279052, "reward_std": 0.2233804479241371, "rewards/accuracy_reward": 0.01458333432674408, "rewards/format_reward": 0.045833333395421505, "rewards/tag_count_reward": 0.9276041805744171, "step": 1905 }, { "clip_ratio": 0.0, "completion_length": 563.2041900634765, "epoch": 0.6100176028164507, "grad_norm": 0.3872288763523102, "kl": 0.8763932049274444, "learning_rate": 7.91978992042055e-06, "loss": 0.1593, "reward": 1.0270833551883698, "reward_std": 0.2873599737882614, "rewards/accuracy_reward": 0.08541666883975267, "rewards/format_reward": 0.027083333767950534, "rewards/tag_count_reward": 0.9145833551883698, "step": 1906 }, { "clip_ratio": 0.0, "completion_length": 575.3521057128906, "epoch": 0.6103376540246439, "grad_norm": 0.26322004199028015, "kl": 0.8161401003599167, "learning_rate": 7.90885963770646e-06, "loss": 0.1386, "reward": 1.0750000536441804, "reward_std": 0.2801113411784172, "rewards/accuracy_reward": 0.10833333786576986, "rewards/format_reward": 0.05416666846722364, "rewards/tag_count_reward": 0.9125000238418579, "step": 1907 }, { "clip_ratio": 0.0, "completion_length": 604.8396118164062, "epoch": 0.6106577052328372, "grad_norm": 0.29171696305274963, "kl": 1.0318253219127655, "learning_rate": 7.89793196692033e-06, "loss": 0.1855, "reward": 1.0354166865348815, "reward_std": 0.26935647130012513, "rewards/accuracy_reward": 0.08958333563059569, "rewards/format_reward": 0.03750000111758709, "rewards/tag_count_reward": 0.9083333432674408, "step": 1908 }, { "clip_ratio": 0.0, "completion_length": 546.5166900634765, "epoch": 0.6109777564410306, "grad_norm": 0.6068216562271118, "kl": 1.0604799330234527, "learning_rate": 7.887006921711301e-06, "loss": 0.176, "reward": 1.0729166924953462, "reward_std": 0.29732812345027926, "rewards/accuracy_reward": 0.05208333544433117, "rewards/format_reward": 0.11875000447034836, "rewards/tag_count_reward": 0.9020833492279052, "step": 1909 }, { "clip_ratio": 0.0, "completion_length": 571.8021057128906, "epoch": 0.6112978076492239, "grad_norm": 0.15164311230182648, "kl": 0.815485092997551, "learning_rate": 7.876084515725248e-06, "loss": 0.1439, "reward": 1.0776041984558105, "reward_std": 0.26183063685894015, "rewards/accuracy_reward": 0.05208333618938923, "rewards/format_reward": 0.11458333488553762, "rewards/tag_count_reward": 0.9109375178813934, "step": 1910 }, { "clip_ratio": 0.0, "completion_length": 547.3104370117187, "epoch": 0.6116178588574172, "grad_norm": 0.16586171090602875, "kl": 0.7920496329665184, "learning_rate": 7.865164762604749e-06, "loss": 0.1445, "reward": 1.0729166924953462, "reward_std": 0.19199963212013244, "rewards/accuracy_reward": 0.0854166692122817, "rewards/format_reward": 0.05208333358168602, "rewards/tag_count_reward": 0.9354166865348816, "step": 1911 }, { "clip_ratio": 0.0, "completion_length": 572.5645965576172, "epoch": 0.6119379100656105, "grad_norm": 0.1570732742547989, "kl": 0.758229385316372, "learning_rate": 7.854247675989057e-06, "loss": 0.1651, "reward": 1.0875000298023223, "reward_std": 0.21725860238075256, "rewards/accuracy_reward": 0.1250000050291419, "rewards/format_reward": 0.025000000186264516, "rewards/tag_count_reward": 0.9375000119209289, "step": 1912 }, { "clip_ratio": 0.0, "completion_length": 548.1791778564453, "epoch": 0.6122579612738038, "grad_norm": 0.20079578459262848, "kl": 0.49525588750839233, "learning_rate": 7.84333326951411e-06, "loss": 0.1341, "reward": 1.123437523841858, "reward_std": 0.2613053783774376, "rewards/accuracy_reward": 0.07916666679084301, "rewards/format_reward": 0.11250000335276127, "rewards/tag_count_reward": 0.9317708492279053, "step": 1913 }, { "clip_ratio": 0.0, "completion_length": 527.6000183105468, "epoch": 0.6125780124819972, "grad_norm": 0.11362726986408234, "kl": 0.5724338337779045, "learning_rate": 7.83242155681248e-06, "loss": 0.1292, "reward": 1.1770833551883697, "reward_std": 0.24515315815806388, "rewards/accuracy_reward": 0.0854166679084301, "rewards/format_reward": 0.17708333805203438, "rewards/tag_count_reward": 0.9145833492279053, "step": 1914 }, { "clip_ratio": 0.0, "completion_length": 533.677099609375, "epoch": 0.6128980636901904, "grad_norm": 0.295296311378479, "kl": 0.3709353081882, "learning_rate": 7.821512551513395e-06, "loss": 0.1497, "reward": 1.2864583730697632, "reward_std": 0.3537163957953453, "rewards/accuracy_reward": 0.17708333861082792, "rewards/format_reward": 0.22291667349636554, "rewards/tag_count_reward": 0.8864583551883698, "step": 1915 }, { "clip_ratio": 0.0, "completion_length": 534.900015258789, "epoch": 0.6132181148983837, "grad_norm": 0.10389820486307144, "kl": 0.4146296791732311, "learning_rate": 7.810606267242687e-06, "loss": 0.154, "reward": 1.1968750476837158, "reward_std": 0.3052997462451458, "rewards/accuracy_reward": 0.1166666703298688, "rewards/format_reward": 0.16875000316649674, "rewards/tag_count_reward": 0.9114583492279053, "step": 1916 }, { "clip_ratio": 0.0, "completion_length": 531.7396026611328, "epoch": 0.6135381661065771, "grad_norm": 0.08782447874546051, "kl": 0.4644440606236458, "learning_rate": 7.799702717622796e-06, "loss": 0.0892, "reward": 1.2354167103767395, "reward_std": 0.237884309142828, "rewards/accuracy_reward": 0.10416667070239782, "rewards/format_reward": 0.2208333408460021, "rewards/tag_count_reward": 0.9104166865348816, "step": 1917 }, { "clip_ratio": 0.0, "completion_length": 573.431265258789, "epoch": 0.6138582173147704, "grad_norm": 0.26354876160621643, "kl": 0.7795211791992187, "learning_rate": 7.788801916272739e-06, "loss": 0.15, "reward": 1.1328125417232513, "reward_std": 0.3334435373544693, "rewards/accuracy_reward": 0.050000001303851606, "rewards/format_reward": 0.1916666716337204, "rewards/tag_count_reward": 0.8911458492279053, "step": 1918 }, { "clip_ratio": 0.0, "completion_length": 558.6041870117188, "epoch": 0.6141782685229636, "grad_norm": 0.23324386775493622, "kl": 0.7294996976852417, "learning_rate": 7.77790387680811e-06, "loss": 0.1578, "reward": 1.2291667103767394, "reward_std": 0.39393016397953035, "rewards/accuracy_reward": 0.06250000111758709, "rewards/format_reward": 0.3125000074505806, "rewards/tag_count_reward": 0.8541666865348816, "step": 1919 }, { "clip_ratio": 0.0, "completion_length": 560.5208435058594, "epoch": 0.614498319731157, "grad_norm": 0.2352185994386673, "kl": 0.8153878182172776, "learning_rate": 7.767008612841045e-06, "loss": 0.1319, "reward": 1.1875000476837159, "reward_std": 0.3735229402780533, "rewards/accuracy_reward": 0.05208333525806665, "rewards/format_reward": 0.26041667386889455, "rewards/tag_count_reward": 0.8750000238418579, "step": 1920 }, { "clip_ratio": 0.0, "completion_length": 552.5416809082031, "epoch": 0.6148183709393503, "grad_norm": 0.37710386514663696, "kl": 0.8285433441400528, "learning_rate": 7.75611613798022e-06, "loss": 0.1573, "reward": 1.3359375476837159, "reward_std": 0.46349499821662904, "rewards/accuracy_reward": 0.17083333637565373, "rewards/format_reward": 0.310416679084301, "rewards/tag_count_reward": 0.8546875178813934, "step": 1921 }, { "clip_ratio": 0.0, "completion_length": 539.3041870117188, "epoch": 0.6151384221475437, "grad_norm": 0.21071451902389526, "kl": 0.8673926889896393, "learning_rate": 7.745226465830817e-06, "loss": 0.1906, "reward": 1.3776042103767394, "reward_std": 0.4966884583234787, "rewards/accuracy_reward": 0.11458333563059568, "rewards/format_reward": 0.450000011920929, "rewards/tag_count_reward": 0.8130208551883698, "step": 1922 }, { "clip_ratio": 0.0, "completion_length": 514.7416839599609, "epoch": 0.6154584733557369, "grad_norm": 0.13909560441970825, "kl": 0.46110083162784576, "learning_rate": 7.734339609994527e-06, "loss": 0.0851, "reward": 1.3984375596046448, "reward_std": 0.43087140917778016, "rewards/accuracy_reward": 0.025000000186264516, "rewards/format_reward": 0.5583333432674408, "rewards/tag_count_reward": 0.8151041865348816, "step": 1923 }, { "clip_ratio": 0.0, "completion_length": 556.8729309082031, "epoch": 0.6157785245639302, "grad_norm": 0.2250901609659195, "kl": 0.4949187658727169, "learning_rate": 7.723455584069524e-06, "loss": 0.1126, "reward": 1.5614583611488342, "reward_std": 0.45720491707324984, "rewards/accuracy_reward": 0.10208333507180214, "rewards/format_reward": 0.6750000208616257, "rewards/tag_count_reward": 0.7843750178813934, "step": 1924 }, { "clip_ratio": 0.0, "completion_length": 551.2750183105469, "epoch": 0.6160985757721236, "grad_norm": 0.5836819410324097, "kl": 0.4990528032183647, "learning_rate": 7.712574401650445e-06, "loss": 0.1638, "reward": 1.677083384990692, "reward_std": 0.40421550869941714, "rewards/accuracy_reward": 0.16250000800937414, "rewards/format_reward": 0.7541666924953461, "rewards/tag_count_reward": 0.7604166805744171, "step": 1925 }, { "clip_ratio": 0.0, "completion_length": 557.9500183105469, "epoch": 0.6164186269803168, "grad_norm": 0.6810193061828613, "kl": 0.6582358777523041, "learning_rate": 7.701696076328368e-06, "loss": 0.1681, "reward": 1.634895884990692, "reward_std": 0.49258761405944823, "rewards/accuracy_reward": 0.11250000298023224, "rewards/format_reward": 0.7833333611488342, "rewards/tag_count_reward": 0.7390625298023223, "step": 1926 }, { "clip_ratio": 0.0, "completion_length": 576.8208526611328, "epoch": 0.6167386781885101, "grad_norm": 0.5494832396507263, "kl": 0.7763916999101639, "learning_rate": 7.690820621690815e-06, "loss": 0.1614, "reward": 1.5572917103767394, "reward_std": 0.46946349143981936, "rewards/accuracy_reward": 0.1166666692122817, "rewards/format_reward": 0.6833333551883698, "rewards/tag_count_reward": 0.7572916865348815, "step": 1927 }, { "clip_ratio": 0.0, "completion_length": 572.1000213623047, "epoch": 0.6170587293967035, "grad_norm": 0.17400583624839783, "kl": 0.6718953251838684, "learning_rate": 7.679948051321708e-06, "loss": 0.2013, "reward": 1.6239583849906922, "reward_std": 0.43792185485363005, "rewards/accuracy_reward": 0.06250000149011611, "rewards/format_reward": 0.8395833551883698, "rewards/tag_count_reward": 0.7218750178813934, "step": 1928 }, { "clip_ratio": 0.0, "completion_length": 559.5145935058594, "epoch": 0.6173787806048968, "grad_norm": 0.24433894455432892, "kl": 0.6602077126502991, "learning_rate": 7.66907837880138e-06, "loss": 0.1903, "reward": 1.645312535762787, "reward_std": 0.4145914763212204, "rewards/accuracy_reward": 0.05416666734963656, "rewards/format_reward": 0.8729166805744171, "rewards/tag_count_reward": 0.7182291865348815, "step": 1929 }, { "clip_ratio": 0.0, "completion_length": 521.279183959961, "epoch": 0.61769883181309, "grad_norm": 0.20922113955020905, "kl": 0.6923602253198624, "learning_rate": 7.65821161770654e-06, "loss": 0.1745, "reward": 1.7348958611488343, "reward_std": 0.34830624908208846, "rewards/accuracy_reward": 0.12500000260770322, "rewards/format_reward": 0.8791666865348816, "rewards/tag_count_reward": 0.7307291924953461, "step": 1930 }, { "clip_ratio": 0.0, "completion_length": 527.6875274658203, "epoch": 0.6180188830212834, "grad_norm": 0.2695390284061432, "kl": 0.9710577547550201, "learning_rate": 7.64734778161025e-06, "loss": 0.2045, "reward": 1.6416667103767395, "reward_std": 0.40317414700984955, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.8562500238418579, "rewards/tag_count_reward": 0.7083333551883697, "step": 1931 }, { "clip_ratio": 0.0, "completion_length": 520.702099609375, "epoch": 0.6183389342294767, "grad_norm": 0.2840893268585205, "kl": 0.859143078327179, "learning_rate": 7.636486884081937e-06, "loss": 0.1529, "reward": 1.7026041865348815, "reward_std": 0.35204153060913085, "rewards/accuracy_reward": 0.0854166692122817, "rewards/format_reward": 0.8958333611488343, "rewards/tag_count_reward": 0.7213541865348816, "step": 1932 }, { "clip_ratio": 0.0, "completion_length": 509.195849609375, "epoch": 0.6186589854376701, "grad_norm": 0.18182797729969025, "kl": 0.4233370810747147, "learning_rate": 7.625628938687349e-06, "loss": 0.143, "reward": 1.802083384990692, "reward_std": 0.2987508878111839, "rewards/accuracy_reward": 0.15625000428408384, "rewards/format_reward": 0.9187500178813934, "rewards/tag_count_reward": 0.7270833551883698, "step": 1933 }, { "clip_ratio": 0.0, "completion_length": 534.0687652587891, "epoch": 0.6189790366458633, "grad_norm": 0.4376506805419922, "kl": 1.092165270447731, "learning_rate": 7.614773958988539e-06, "loss": 0.2685, "reward": 1.6338541984558106, "reward_std": 0.3932381421327591, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.887500011920929, "rewards/tag_count_reward": 0.7088541865348816, "step": 1934 }, { "clip_ratio": 0.0, "completion_length": 524.0875091552734, "epoch": 0.6192990878540566, "grad_norm": 0.36206111311912537, "kl": 0.5829787597060203, "learning_rate": 7.6039219585438676e-06, "loss": 0.1531, "reward": 1.6567708730697632, "reward_std": 0.31807751953601837, "rewards/accuracy_reward": 0.020833334140479564, "rewards/format_reward": 0.9104166924953461, "rewards/tag_count_reward": 0.7255208611488342, "step": 1935 }, { "clip_ratio": 0.0, "completion_length": 535.9791870117188, "epoch": 0.61961913906225, "grad_norm": 0.37123608589172363, "kl": 0.6474206149578094, "learning_rate": 7.593072950907969e-06, "loss": 0.1671, "reward": 1.6927083611488343, "reward_std": 0.3547057643532753, "rewards/accuracy_reward": 0.05833333469927311, "rewards/format_reward": 0.9125000298023224, "rewards/tag_count_reward": 0.721875011920929, "step": 1936 }, { "clip_ratio": 0.0, "completion_length": 526.4437622070312, "epoch": 0.6199391902704433, "grad_norm": 0.2424151450395584, "kl": 0.39393432140350343, "learning_rate": 7.582226949631737e-06, "loss": 0.1283, "reward": 1.7213542222976685, "reward_std": 0.25976524502038956, "rewards/accuracy_reward": 0.050000001303851606, "rewards/format_reward": 0.9416666924953461, "rewards/tag_count_reward": 0.7296875238418579, "step": 1937 }, { "clip_ratio": 0.0, "completion_length": 529.4000213623046, "epoch": 0.6202592414786365, "grad_norm": 0.1958591789007187, "kl": 0.5948910281062126, "learning_rate": 7.571383968262317e-06, "loss": 0.1273, "reward": 1.8114583611488342, "reward_std": 0.2821238741278648, "rewards/accuracy_reward": 0.1645833384245634, "rewards/format_reward": 0.9250000238418579, "rewards/tag_count_reward": 0.7218750298023224, "step": 1938 }, { "clip_ratio": 0.0, "completion_length": 504.9333526611328, "epoch": 0.6205792926868299, "grad_norm": 0.17749886214733124, "kl": 0.2720214515924454, "learning_rate": 7.560544020343071e-06, "loss": 0.0659, "reward": 1.8145833611488342, "reward_std": 0.1834358900785446, "rewards/accuracy_reward": 0.10625000409781933, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7458333551883698, "step": 1939 }, { "clip_ratio": 0.0, "completion_length": 535.8000183105469, "epoch": 0.6208993438950232, "grad_norm": 0.25111111998558044, "kl": 0.45597586706280707, "learning_rate": 7.5497071194135875e-06, "loss": 0.1531, "reward": 1.692187535762787, "reward_std": 0.27718111127614975, "rewards/accuracy_reward": 0.03333333451300859, "rewards/format_reward": 0.931250023841858, "rewards/tag_count_reward": 0.727604192495346, "step": 1940 }, { "clip_ratio": 0.0, "completion_length": 520.0354309082031, "epoch": 0.6212193951032166, "grad_norm": 0.17664361000061035, "kl": 0.3269217021763325, "learning_rate": 7.538873279009637e-06, "loss": 0.1029, "reward": 1.7447916984558105, "reward_std": 0.23195823952555655, "rewards/accuracy_reward": 0.05625000055879355, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7427083492279053, "step": 1941 }, { "clip_ratio": 0.0, "completion_length": 538.0333404541016, "epoch": 0.6215394463114098, "grad_norm": 0.18127109110355377, "kl": 0.3480118840932846, "learning_rate": 7.528042512663174e-06, "loss": 0.0889, "reward": 1.6937500357627868, "reward_std": 0.2795826196670532, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.9166666865348816, "rewards/tag_count_reward": 0.743750023841858, "step": 1942 }, { "clip_ratio": 0.0, "completion_length": 529.0354309082031, "epoch": 0.6218594975196031, "grad_norm": 0.1579395979642868, "kl": 0.2698191873729229, "learning_rate": 7.517214833902307e-06, "loss": 0.0553, "reward": 1.882812535762787, "reward_std": 0.16483215913176535, "rewards/accuracy_reward": 0.16458333637565375, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.745312511920929, "step": 1943 }, { "clip_ratio": 0.0, "completion_length": 523.779183959961, "epoch": 0.6221795487277965, "grad_norm": 0.17572399973869324, "kl": 0.28989646807312963, "learning_rate": 7.506390256251294e-06, "loss": 0.0957, "reward": 1.7708333611488343, "reward_std": 0.21848259344697, "rewards/accuracy_reward": 0.07083333525806665, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7395833611488343, "step": 1944 }, { "clip_ratio": 0.0, "completion_length": 527.0583465576171, "epoch": 0.6224995999359898, "grad_norm": 0.2514660060405731, "kl": 0.25404314175248144, "learning_rate": 7.495568793230516e-06, "loss": 0.0653, "reward": 1.7812500476837159, "reward_std": 0.1805768422782421, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7395833492279053, "step": 1945 }, { "clip_ratio": 0.0, "completion_length": 531.3854309082031, "epoch": 0.622819651144183, "grad_norm": 0.09252886474132538, "kl": 0.20499701499938966, "learning_rate": 7.484750458356467e-06, "loss": 0.0657, "reward": 1.8531250476837158, "reward_std": 0.25213652551174165, "rewards/accuracy_reward": 0.15625000335276126, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7364583551883698, "step": 1946 }, { "clip_ratio": 0.0, "completion_length": 539.2666870117188, "epoch": 0.6231397023523764, "grad_norm": 0.12097577005624771, "kl": 0.39013560861349106, "learning_rate": 7.47393526514173e-06, "loss": 0.0894, "reward": 1.754687535762787, "reward_std": 0.23477007076144218, "rewards/accuracy_reward": 0.06250000167638063, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.735937523841858, "step": 1947 }, { "clip_ratio": 0.0, "completion_length": 540.0771118164063, "epoch": 0.6234597535605697, "grad_norm": 0.2490178346633911, "kl": 0.3661611221730709, "learning_rate": 7.463123227094962e-06, "loss": 0.0698, "reward": 1.8302083611488342, "reward_std": 0.17300696298480034, "rewards/accuracy_reward": 0.12291666977107525, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7385416924953461, "step": 1948 }, { "clip_ratio": 0.0, "completion_length": 542.6500122070313, "epoch": 0.623779804768763, "grad_norm": 0.151083305478096, "kl": 0.30013838559389117, "learning_rate": 7.452314357720888e-06, "loss": 0.094, "reward": 1.7109375238418578, "reward_std": 0.22202819362282752, "rewards/accuracy_reward": 0.01875000037252903, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7359375178813934, "step": 1949 }, { "clip_ratio": 0.0, "completion_length": 528.785433959961, "epoch": 0.6240998559769563, "grad_norm": 0.1321251392364502, "kl": 0.3549846962094307, "learning_rate": 7.441508670520271e-06, "loss": 0.1134, "reward": 1.7364583611488342, "reward_std": 0.14900253042578698, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7364583492279053, "step": 1950 }, { "clip_ratio": 0.0, "completion_length": 526.3896026611328, "epoch": 0.6244199071851496, "grad_norm": 0.15130139887332916, "kl": 0.3252772256731987, "learning_rate": 7.430706178989895e-06, "loss": 0.0679, "reward": 1.784375047683716, "reward_std": 0.1683867707848549, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7385416805744172, "step": 1951 }, { "clip_ratio": 0.0, "completion_length": 531.220849609375, "epoch": 0.624739958393343, "grad_norm": 0.11556895822286606, "kl": 0.2892810679972172, "learning_rate": 7.419906896622556e-06, "loss": 0.1023, "reward": 1.7937500596046447, "reward_std": 0.2092138223350048, "rewards/accuracy_reward": 0.0895833358168602, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7395833492279053, "step": 1952 }, { "clip_ratio": 0.0, "completion_length": 516.4854339599609, "epoch": 0.6250600096015363, "grad_norm": 0.10080796480178833, "kl": 0.22869173735380172, "learning_rate": 7.409110836907041e-06, "loss": 0.0724, "reward": 1.8031250596046449, "reward_std": 0.1844940721988678, "rewards/accuracy_reward": 0.08750000186264514, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7427083551883698, "step": 1953 }, { "clip_ratio": 0.0, "completion_length": 549.5729370117188, "epoch": 0.6253800608097295, "grad_norm": 0.10239440947771072, "kl": 0.32757807746529577, "learning_rate": 7.398318013328112e-06, "loss": 0.1184, "reward": 1.7677083969116212, "reward_std": 0.23693208321928977, "rewards/accuracy_reward": 0.08333333451300859, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7302083551883698, "step": 1954 }, { "clip_ratio": 0.0, "completion_length": 531.2791778564454, "epoch": 0.6257001120179229, "grad_norm": 0.08784143626689911, "kl": 0.23789920881390572, "learning_rate": 7.387528439366491e-06, "loss": 0.089, "reward": 1.7901042103767395, "reward_std": 0.19504887610673904, "rewards/accuracy_reward": 0.08125000353902578, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7421875178813935, "step": 1955 }, { "clip_ratio": 0.0, "completion_length": 559.0375244140625, "epoch": 0.6260201632261162, "grad_norm": 0.09152630716562271, "kl": 0.23061190843582152, "learning_rate": 7.376742128498835e-06, "loss": 0.0863, "reward": 1.8093750119209289, "reward_std": 0.20874695628881454, "rewards/accuracy_reward": 0.1062500026077032, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7385416805744172, "step": 1956 }, { "clip_ratio": 0.0, "completion_length": 524.7562622070312, "epoch": 0.6263402144343095, "grad_norm": 0.07687767595052719, "kl": 0.2703046713024378, "learning_rate": 7.365959094197734e-06, "loss": 0.0544, "reward": 1.8505208492279053, "reward_std": 0.15929230451583862, "rewards/accuracy_reward": 0.1395833373069763, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7421875178813935, "step": 1957 }, { "clip_ratio": 0.0, "completion_length": 565.7083618164063, "epoch": 0.6266602656425028, "grad_norm": 0.05818747729063034, "kl": 0.14957574531435966, "learning_rate": 7.35517934993168e-06, "loss": 0.0521, "reward": 1.7531250357627868, "reward_std": 0.15787961557507516, "rewards/accuracy_reward": 0.03125000074505806, "rewards/format_reward": 0.9791666805744171, "rewards/tag_count_reward": 0.7427083432674408, "step": 1958 }, { "clip_ratio": 0.0, "completion_length": 541.4000183105469, "epoch": 0.6269803168506961, "grad_norm": 0.1020335853099823, "kl": 0.3409750394523144, "learning_rate": 7.344402909165053e-06, "loss": 0.0373, "reward": 1.7281250357627869, "reward_std": 0.1918536826968193, "rewards/accuracy_reward": 0.018750000558793545, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7406250178813935, "step": 1959 }, { "clip_ratio": 0.0, "completion_length": 539.8187713623047, "epoch": 0.6273003680588894, "grad_norm": 0.09709202498197556, "kl": 0.25619880557060243, "learning_rate": 7.3336297853581115e-06, "loss": 0.0861, "reward": 1.7859375476837158, "reward_std": 0.18421316221356393, "rewards/accuracy_reward": 0.08125000242143869, "rewards/format_reward": 0.9666666984558105, "rewards/tag_count_reward": 0.7380208611488343, "step": 1960 }, { "clip_ratio": 0.0, "completion_length": 577.2333557128907, "epoch": 0.6276204192670828, "grad_norm": 0.32538357377052307, "kl": 0.4842786967754364, "learning_rate": 7.322859991966973e-06, "loss": 0.0732, "reward": 1.7713542103767395, "reward_std": 0.15907796677201985, "rewards/accuracy_reward": 0.05833333488553762, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7442708551883698, "step": 1961 }, { "clip_ratio": 0.0, "completion_length": 537.0833557128906, "epoch": 0.627940470475276, "grad_norm": 0.20227976143360138, "kl": 0.24760584570467473, "learning_rate": 7.3120935424435856e-06, "loss": 0.0577, "reward": 1.770312535762787, "reward_std": 0.15020546615123748, "rewards/accuracy_reward": 0.05833333488553762, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.745312511920929, "step": 1962 }, { "clip_ratio": 0.0, "completion_length": 517.2000152587891, "epoch": 0.6282605216834694, "grad_norm": 0.12068881839513779, "kl": 0.23002145811915398, "learning_rate": 7.301330450235733e-06, "loss": 0.0691, "reward": 1.7859375596046447, "reward_std": 0.20177326947450638, "rewards/accuracy_reward": 0.07916666772216559, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7421875178813935, "step": 1963 }, { "clip_ratio": 0.0, "completion_length": 547.9041809082031, "epoch": 0.6285805728916627, "grad_norm": 0.08962026238441467, "kl": 0.269534295797348, "learning_rate": 7.290570728786992e-06, "loss": 0.0775, "reward": 1.7885417103767396, "reward_std": 0.19415293484926224, "rewards/accuracy_reward": 0.08750000372529029, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7385416865348816, "step": 1964 }, { "clip_ratio": 0.0, "completion_length": 543.2771026611329, "epoch": 0.628900624099856, "grad_norm": 0.04769608750939369, "kl": 0.21004538014531135, "learning_rate": 7.279814391536744e-06, "loss": 0.015, "reward": 1.8588542342185974, "reward_std": 0.13069503456354142, "rewards/accuracy_reward": 0.13333333544433118, "rewards/format_reward": 0.9791666746139527, "rewards/tag_count_reward": 0.7463541686534881, "step": 1965 }, { "clip_ratio": 0.0, "completion_length": 548.3083557128906, "epoch": 0.6292206753080493, "grad_norm": 0.0883219763636589, "kl": 0.14996635988354684, "learning_rate": 7.2690614519201315e-06, "loss": 0.0513, "reward": 1.8463542103767394, "reward_std": 0.1668414853513241, "rewards/accuracy_reward": 0.1354166716337204, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7401041865348816, "step": 1966 }, { "clip_ratio": 0.0, "completion_length": 570.1729370117188, "epoch": 0.6295407265162426, "grad_norm": 0.10870802402496338, "kl": 0.22853438630700112, "learning_rate": 7.258311923368062e-06, "loss": 0.0778, "reward": 1.8182292103767395, "reward_std": 0.2189410574734211, "rewards/accuracy_reward": 0.11666667070239782, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7390625298023223, "step": 1967 }, { "clip_ratio": 0.0, "completion_length": 519.5916809082031, "epoch": 0.6298607777244359, "grad_norm": 2.5341570377349854, "kl": 0.30130406394600867, "learning_rate": 7.247565819307172e-06, "loss": 0.0612, "reward": 1.808333396911621, "reward_std": 0.18026887029409408, "rewards/accuracy_reward": 0.09791667014360428, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7416666865348815, "step": 1968 }, { "clip_ratio": 0.0, "completion_length": 568.8312713623047, "epoch": 0.6301808289326292, "grad_norm": 0.7344006299972534, "kl": 0.7502999603748322, "learning_rate": 7.236823153159832e-06, "loss": 0.1116, "reward": 1.283854216337204, "reward_std": 0.47162422239780427, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.49375001192092893, "rewards/tag_count_reward": 0.7505208492279053, "step": 1969 }, { "clip_ratio": 0.0, "completion_length": 628.1062622070312, "epoch": 0.6305008801408225, "grad_norm": 1.0056172609329224, "kl": 1.3431001484394074, "learning_rate": 7.226083938344108e-06, "loss": 0.1292, "reward": 1.2906250357627869, "reward_std": 0.49476856291294097, "rewards/accuracy_reward": 0.05416666883975267, "rewards/format_reward": 0.4625000149011612, "rewards/tag_count_reward": 0.7739583671092987, "step": 1970 }, { "clip_ratio": 0.0, "completion_length": 595.4312622070313, "epoch": 0.6308209313490158, "grad_norm": 0.4282771646976471, "kl": 0.5381355553865432, "learning_rate": 7.215348188273768e-06, "loss": 0.0965, "reward": 1.5119791984558106, "reward_std": 0.46053238213062286, "rewards/accuracy_reward": 0.01875000037252903, "rewards/format_reward": 0.7354166805744171, "rewards/tag_count_reward": 0.7578125178813935, "step": 1971 }, { "clip_ratio": 0.0, "completion_length": 561.3479370117187, "epoch": 0.6311409825572092, "grad_norm": 0.1805924028158188, "kl": 0.44332694709300996, "learning_rate": 7.204615916358234e-06, "loss": 0.1127, "reward": 1.722395896911621, "reward_std": 0.33906579315662383, "rewards/accuracy_reward": 0.1354166716337204, "rewards/format_reward": 0.8291666865348816, "rewards/tag_count_reward": 0.7578125238418579, "step": 1972 }, { "clip_ratio": 0.0, "completion_length": 554.2000183105469, "epoch": 0.6314610337654024, "grad_norm": 0.21334503591060638, "kl": 0.3289230242371559, "learning_rate": 7.193887136002599e-06, "loss": 0.0996, "reward": 1.7250000476837157, "reward_std": 0.27289713025093076, "rewards/accuracy_reward": 0.07916666902601718, "rewards/format_reward": 0.8875000178813934, "rewards/tag_count_reward": 0.7583333611488342, "step": 1973 }, { "clip_ratio": 0.0, "completion_length": 566.633349609375, "epoch": 0.6317810849735958, "grad_norm": 0.16588036715984344, "kl": 0.34746977835893633, "learning_rate": 7.183161860607592e-06, "loss": 0.0993, "reward": 1.7114583611488343, "reward_std": 0.3407271146774292, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.8625000238418579, "rewards/tag_count_reward": 0.7552083551883697, "step": 1974 }, { "clip_ratio": 0.0, "completion_length": 592.1104370117188, "epoch": 0.6321011361817891, "grad_norm": 0.4449247717857361, "kl": 0.354316396266222, "learning_rate": 7.172440103569566e-06, "loss": 0.1278, "reward": 1.7458333849906922, "reward_std": 0.26184590086340903, "rewards/accuracy_reward": 0.09791666977107524, "rewards/format_reward": 0.8937500178813934, "rewards/tag_count_reward": 0.7541666865348816, "step": 1975 }, { "clip_ratio": 0.0, "completion_length": 595.2104370117188, "epoch": 0.6324211873899824, "grad_norm": 0.343107134103775, "kl": 0.3772110417485237, "learning_rate": 7.161721878280467e-06, "loss": 0.1068, "reward": 1.7208333730697631, "reward_std": 0.30226452052593233, "rewards/accuracy_reward": 0.08750000260770321, "rewards/format_reward": 0.8791666924953461, "rewards/tag_count_reward": 0.7541666865348816, "step": 1976 }, { "clip_ratio": 0.0, "completion_length": 586.1625122070312, "epoch": 0.6327412385981757, "grad_norm": 0.5401050448417664, "kl": 0.32526273727416993, "learning_rate": 7.151007198127844e-06, "loss": 0.1215, "reward": 1.7130208611488342, "reward_std": 0.2510280154645443, "rewards/accuracy_reward": 0.03958333414047956, "rewards/format_reward": 0.9270833432674408, "rewards/tag_count_reward": 0.7463541746139526, "step": 1977 }, { "clip_ratio": 0.0, "completion_length": 555.6208587646485, "epoch": 0.633061289806369, "grad_norm": 0.34722357988357544, "kl": 0.4616437517106533, "learning_rate": 7.140296076494809e-06, "loss": 0.0943, "reward": 1.7385416984558106, "reward_std": 0.24983570948243142, "rewards/accuracy_reward": 0.05833333507180214, "rewards/format_reward": 0.9333333492279052, "rewards/tag_count_reward": 0.7468750178813934, "step": 1978 }, { "clip_ratio": 0.0, "completion_length": 561.008349609375, "epoch": 0.6333813410145623, "grad_norm": 0.18879617750644684, "kl": 0.3938341312110424, "learning_rate": 7.129588526760036e-06, "loss": 0.0789, "reward": 1.7145833730697633, "reward_std": 0.3124412089586258, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.9125000238418579, "rewards/tag_count_reward": 0.7395833551883697, "step": 1979 }, { "clip_ratio": 0.0, "completion_length": 571.4187591552734, "epoch": 0.6337013922227557, "grad_norm": 0.45781409740448, "kl": 0.4807680070400238, "learning_rate": 7.11888456229773e-06, "loss": 0.0984, "reward": 1.6822917103767394, "reward_std": 0.3750412121415138, "rewards/accuracy_reward": 0.04791666753590107, "rewards/format_reward": 0.8979166865348815, "rewards/tag_count_reward": 0.7364583492279053, "step": 1980 }, { "clip_ratio": 0.0, "completion_length": 744.9520935058594, "epoch": 0.6340214434309489, "grad_norm": 9.180135726928711, "kl": 2.3556045293807983, "learning_rate": 7.108184196477622e-06, "loss": 0.2895, "reward": 1.3651041984558105, "reward_std": 0.4459951549768448, "rewards/accuracy_reward": 0.04791666902601719, "rewards/format_reward": 0.48541668355464934, "rewards/tag_count_reward": 0.8317708611488343, "step": 1981 }, { "clip_ratio": 0.0, "completion_length": 561.0333557128906, "epoch": 0.6343414946391422, "grad_norm": 0.34384164214134216, "kl": 0.6166922569274902, "learning_rate": 7.097487442664952e-06, "loss": 0.1315, "reward": 1.853645884990692, "reward_std": 0.30587767958641054, "rewards/accuracy_reward": 0.20416667461395263, "rewards/format_reward": 0.9187500178813934, "rewards/tag_count_reward": 0.7307291865348816, "step": 1982 }, { "clip_ratio": 0.0, "completion_length": 537.9729309082031, "epoch": 0.6346615458473356, "grad_norm": 0.19648754596710205, "kl": 0.4563448905944824, "learning_rate": 7.086794314220445e-06, "loss": 0.1238, "reward": 1.7338542103767396, "reward_std": 0.2317870318889618, "rewards/accuracy_reward": 0.05625000204890966, "rewards/format_reward": 0.9416666805744172, "rewards/tag_count_reward": 0.7359375178813934, "step": 1983 }, { "clip_ratio": 0.0, "completion_length": 588.489599609375, "epoch": 0.6349815970555289, "grad_norm": 0.3531632125377655, "kl": 0.7605800554156303, "learning_rate": 7.076104824500294e-06, "loss": 0.1186, "reward": 1.7307291984558106, "reward_std": 0.2941200569272041, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.9187500178813934, "rewards/tag_count_reward": 0.736979192495346, "step": 1984 }, { "clip_ratio": 0.0, "completion_length": 573.0041809082031, "epoch": 0.6353016482637222, "grad_norm": 0.2388213574886322, "kl": 0.5445457905530929, "learning_rate": 7.0654189868561515e-06, "loss": 0.1156, "reward": 1.7270833611488343, "reward_std": 0.22447171062231064, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7395833492279053, "step": 1985 }, { "clip_ratio": 0.0, "completion_length": 582.5666870117187, "epoch": 0.6356216994719155, "grad_norm": 0.1977159082889557, "kl": 0.4999621480703354, "learning_rate": 7.054736814635106e-06, "loss": 0.0949, "reward": 1.803645873069763, "reward_std": 0.24051545932888985, "rewards/accuracy_reward": 0.12500000335276126, "rewards/format_reward": 0.9416666805744172, "rewards/tag_count_reward": 0.7369791865348816, "step": 1986 }, { "clip_ratio": 0.0, "completion_length": 543.5979400634766, "epoch": 0.6359417506801088, "grad_norm": 0.3220207691192627, "kl": 0.5277846544981003, "learning_rate": 7.044058321179671e-06, "loss": 0.1443, "reward": 1.7963541984558105, "reward_std": 0.2664462685585022, "rewards/accuracy_reward": 0.1125000037252903, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7338541865348815, "step": 1987 }, { "clip_ratio": 0.0, "completion_length": 578.0437683105469, "epoch": 0.6362618018883022, "grad_norm": 0.14741215109825134, "kl": 0.372174845635891, "learning_rate": 7.033383519827763e-06, "loss": 0.099, "reward": 1.7000000476837158, "reward_std": 0.22901730239391327, "rewards/accuracy_reward": 0.022916667722165586, "rewards/format_reward": 0.9375000238418579, "rewards/tag_count_reward": 0.7395833611488343, "step": 1988 }, { "clip_ratio": 0.0, "completion_length": 549.4062713623047, "epoch": 0.6365818530964954, "grad_norm": 0.19315893948078156, "kl": 0.5451448887586594, "learning_rate": 7.022712423912682e-06, "loss": 0.115, "reward": 1.7458333730697633, "reward_std": 0.28968684524297716, "rewards/accuracy_reward": 0.08125000298023224, "rewards/format_reward": 0.9333333611488343, "rewards/tag_count_reward": 0.7312500238418579, "step": 1989 }, { "clip_ratio": 0.0, "completion_length": 536.4125213623047, "epoch": 0.6369019043046887, "grad_norm": 0.09960630536079407, "kl": 0.27057635635137556, "learning_rate": 7.012045046763111e-06, "loss": 0.0506, "reward": 1.8739583849906922, "reward_std": 0.16413490921258928, "rewards/accuracy_reward": 0.15625000223517418, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7427083373069763, "step": 1990 }, { "clip_ratio": 0.0, "completion_length": 501.5854248046875, "epoch": 0.6372219555128821, "grad_norm": 0.11986377090215683, "kl": 0.22691030353307723, "learning_rate": 7.00138140170308e-06, "loss": 0.1244, "reward": 1.7781250596046447, "reward_std": 0.19470058530569076, "rewards/accuracy_reward": 0.08333333544433116, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7447916746139527, "step": 1991 }, { "clip_ratio": 0.0, "completion_length": 536.3812683105468, "epoch": 0.6375420067210754, "grad_norm": 0.1236194372177124, "kl": 0.204738799482584, "learning_rate": 6.990721502051958e-06, "loss": 0.073, "reward": 1.8322917222976685, "reward_std": 0.23166320994496345, "rewards/accuracy_reward": 0.13750000521540642, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7385416865348816, "step": 1992 }, { "clip_ratio": 0.0, "completion_length": 537.5916809082031, "epoch": 0.6378620579292686, "grad_norm": 0.24665555357933044, "kl": 0.27333122715353964, "learning_rate": 6.980065361124437e-06, "loss": 0.0945, "reward": 1.7286458611488342, "reward_std": 0.22112812250852584, "rewards/accuracy_reward": 0.03333333451300859, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7411458551883697, "step": 1993 }, { "clip_ratio": 0.0, "completion_length": 539.4541839599609, "epoch": 0.638182109137462, "grad_norm": 0.23802857100963593, "kl": 0.259119226410985, "learning_rate": 6.969412992230518e-06, "loss": 0.0884, "reward": 1.756770873069763, "reward_std": 0.16711597740650178, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.9500000298023223, "rewards/tag_count_reward": 0.7380208611488343, "step": 1994 }, { "clip_ratio": 0.0, "completion_length": 520.2521026611328, "epoch": 0.6385021603456553, "grad_norm": 0.1773439198732376, "kl": 0.22130618281662465, "learning_rate": 6.95876440867548e-06, "loss": 0.1208, "reward": 1.7510417222976684, "reward_std": 0.22782276272773744, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.9416666805744172, "rewards/tag_count_reward": 0.7343750178813935, "step": 1995 }, { "clip_ratio": 0.0, "completion_length": 524.5958526611328, "epoch": 0.6388222115538487, "grad_norm": 0.13545185327529907, "kl": 0.15213358253240586, "learning_rate": 6.948119623759888e-06, "loss": 0.0606, "reward": 1.8223958730697631, "reward_std": 0.18742815032601357, "rewards/accuracy_reward": 0.11875000204890966, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7390625178813934, "step": 1996 }, { "clip_ratio": 0.0, "completion_length": 522.2375183105469, "epoch": 0.6391422627620419, "grad_norm": 0.12271955609321594, "kl": 0.21166965663433074, "learning_rate": 6.937478650779548e-06, "loss": 0.086, "reward": 1.7151041984558106, "reward_std": 0.2101954735815525, "rewards/accuracy_reward": 0.016666666977107523, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7421875178813935, "step": 1997 }, { "clip_ratio": 0.0, "completion_length": 538.2770965576171, "epoch": 0.6394623139702352, "grad_norm": 0.10828928649425507, "kl": 0.31541863903403283, "learning_rate": 6.926841503025513e-06, "loss": 0.0909, "reward": 1.7812500596046448, "reward_std": 0.17954575568437575, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7416666865348815, "step": 1998 }, { "clip_ratio": 0.0, "completion_length": 523.4833404541016, "epoch": 0.6397823651784286, "grad_norm": 0.13755281269550323, "kl": 0.34499868750572205, "learning_rate": 6.916208193784062e-06, "loss": 0.0742, "reward": 1.8619792222976685, "reward_std": 0.22622655108571052, "rewards/accuracy_reward": 0.16666667070239782, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7411458492279053, "step": 1999 }, { "clip_ratio": 0.0, "completion_length": 545.8062744140625, "epoch": 0.6401024163866219, "grad_norm": 0.1748357117176056, "kl": 0.20692237839102745, "learning_rate": 6.905578736336673e-06, "loss": 0.1086, "reward": 1.760937511920929, "reward_std": 0.18903874084353448, "rewards/accuracy_reward": 0.05416666716337204, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7421875238418579, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 538.4271026611328, "epoch": 0.6404224675948151, "grad_norm": 0.11439496278762817, "kl": 0.3237125493586063, "learning_rate": 6.89495314396001e-06, "loss": 0.0511, "reward": 1.778125023841858, "reward_std": 0.17002107575535774, "rewards/accuracy_reward": 0.07500000111758709, "rewards/format_reward": 0.9625000059604645, "rewards/tag_count_reward": 0.740625011920929, "step": 2001 }, { "clip_ratio": 0.0, "completion_length": 518.7208526611328, "epoch": 0.6407425188030085, "grad_norm": 0.07779530435800552, "kl": 0.18974074572324753, "learning_rate": 6.884331429925919e-06, "loss": 0.0578, "reward": 1.717708373069763, "reward_std": 0.20087785869836808, "rewards/accuracy_reward": 0.01875000074505806, "rewards/format_reward": 0.9604166746139526, "rewards/tag_count_reward": 0.7385416805744172, "step": 2002 }, { "clip_ratio": 0.0, "completion_length": 531.1646087646484, "epoch": 0.6410625700112018, "grad_norm": 0.15299402177333832, "kl": 0.20888833254575728, "learning_rate": 6.8737136075013925e-06, "loss": 0.0718, "reward": 1.7515625357627869, "reward_std": 0.12168702185153961, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7453125178813934, "step": 2003 }, { "clip_ratio": 0.0, "completion_length": 511.0583526611328, "epoch": 0.6413826212193952, "grad_norm": 0.2596868574619293, "kl": 0.48813455291092395, "learning_rate": 6.863099689948569e-06, "loss": 0.123, "reward": 1.8130208849906921, "reward_std": 0.26384441256523133, "rewards/accuracy_reward": 0.11875000055879355, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7338541865348815, "step": 2004 }, { "clip_ratio": 0.0, "completion_length": 519.629183959961, "epoch": 0.6417026724275884, "grad_norm": 0.19190722703933716, "kl": 0.1685401350259781, "learning_rate": 6.852489690524703e-06, "loss": 0.0524, "reward": 1.8177083611488343, "reward_std": 0.12307879701256752, "rewards/accuracy_reward": 0.08958333544433117, "rewards/format_reward": 0.9833333492279053, "rewards/tag_count_reward": 0.7447916805744171, "step": 2005 }, { "clip_ratio": 0.0, "completion_length": 524.3729278564454, "epoch": 0.6420227236357817, "grad_norm": 0.21922363340854645, "kl": 0.2787697918713093, "learning_rate": 6.84188362248216e-06, "loss": 0.0813, "reward": 1.786458384990692, "reward_std": 0.16800421923398973, "rewards/accuracy_reward": 0.0687500024214387, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.746875011920929, "step": 2006 }, { "clip_ratio": 0.0, "completion_length": 536.2312652587891, "epoch": 0.6423427748439751, "grad_norm": 0.09753888100385666, "kl": 0.174386228621006, "learning_rate": 6.831281499068396e-06, "loss": 0.0773, "reward": 1.7661458730697632, "reward_std": 0.1401790753006935, "rewards/accuracy_reward": 0.0458333345130086, "rewards/format_reward": 0.975000011920929, "rewards/tag_count_reward": 0.7453125178813934, "step": 2007 }, { "clip_ratio": 0.0, "completion_length": 539.6979370117188, "epoch": 0.6426628260521684, "grad_norm": 0.13030627369880676, "kl": 0.1947503164410591, "learning_rate": 6.820683333525942e-06, "loss": 0.0801, "reward": 1.7135417222976685, "reward_std": 0.15063118934631348, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7447916805744171, "step": 2008 }, { "clip_ratio": 0.0, "completion_length": 555.7104400634765, "epoch": 0.6429828772603616, "grad_norm": 0.15181896090507507, "kl": 0.35972325801849364, "learning_rate": 6.810089139092371e-06, "loss": 0.178, "reward": 1.7250000596046449, "reward_std": 0.2836567386984825, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.9145833611488342, "rewards/tag_count_reward": 0.7395833492279053, "step": 2009 }, { "clip_ratio": 0.0, "completion_length": 565.7541931152343, "epoch": 0.643302928468555, "grad_norm": 0.19096185266971588, "kl": 0.5030779674649238, "learning_rate": 6.7994989290003165e-06, "loss": 0.2042, "reward": 1.663020873069763, "reward_std": 0.33288909047842025, "rewards/accuracy_reward": 0.027083334513008595, "rewards/format_reward": 0.8875000238418579, "rewards/tag_count_reward": 0.7484375178813935, "step": 2010 }, { "clip_ratio": 0.0, "completion_length": 564.2125183105469, "epoch": 0.6436229796767483, "grad_norm": 0.10490421950817108, "kl": 0.27069406807422636, "learning_rate": 6.788912716477417e-06, "loss": 0.1012, "reward": 1.7651042103767396, "reward_std": 0.26789544969797136, "rewards/accuracy_reward": 0.07708333507180214, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7380208492279052, "step": 2011 }, { "clip_ratio": 0.0, "completion_length": 534.5729309082031, "epoch": 0.6439430308849415, "grad_norm": 0.0868968516588211, "kl": 0.19741727262735367, "learning_rate": 6.7783305147463295e-06, "loss": 0.06, "reward": 1.7609375238418579, "reward_std": 0.1284565381705761, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7463541746139526, "step": 2012 }, { "clip_ratio": 0.0, "completion_length": 559.556265258789, "epoch": 0.6442630820931349, "grad_norm": 0.6286680102348328, "kl": 0.3144658826291561, "learning_rate": 6.7677523370247e-06, "loss": 0.0986, "reward": 1.763020896911621, "reward_std": 0.21080692261457443, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7401041805744171, "step": 2013 }, { "clip_ratio": 0.0, "completion_length": 536.675015258789, "epoch": 0.6445831333013282, "grad_norm": 0.06401374191045761, "kl": 0.15224939808249474, "learning_rate": 6.7571781965251405e-06, "loss": 0.056, "reward": 1.7734375238418578, "reward_std": 0.1219717726111412, "rewards/accuracy_reward": 0.0520833358168602, "rewards/format_reward": 0.9770833551883698, "rewards/tag_count_reward": 0.7442708373069763, "step": 2014 }, { "clip_ratio": 0.0, "completion_length": 541.820849609375, "epoch": 0.6449031845095216, "grad_norm": 0.1311735063791275, "kl": 0.2862920179963112, "learning_rate": 6.746608106455231e-06, "loss": 0.0856, "reward": 1.8614583730697631, "reward_std": 0.19478606656193734, "rewards/accuracy_reward": 0.15208333544433117, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7406250178813935, "step": 2015 }, { "clip_ratio": 0.0, "completion_length": 560.3146026611328, "epoch": 0.6452232357177148, "grad_norm": 0.10083205252885818, "kl": 0.260428823530674, "learning_rate": 6.736042080017488e-06, "loss": 0.0828, "reward": 1.7416666984558105, "reward_std": 0.19739598780870438, "rewards/accuracy_reward": 0.03541666828095913, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7395833492279053, "step": 2016 }, { "clip_ratio": 0.0, "completion_length": 537.060433959961, "epoch": 0.6455432869259081, "grad_norm": 0.1420152336359024, "kl": 0.3134328491985798, "learning_rate": 6.725480130409347e-06, "loss": 0.0788, "reward": 1.8203125476837159, "reward_std": 0.20216676890850066, "rewards/accuracy_reward": 0.11666667275130749, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7390625238418579, "step": 2017 }, { "clip_ratio": 0.0, "completion_length": 573.4458526611328, "epoch": 0.6458633381341015, "grad_norm": 0.1362895667552948, "kl": 0.3763896100223064, "learning_rate": 6.714922270823159e-06, "loss": 0.0935, "reward": 1.7828125476837158, "reward_std": 0.1553657740354538, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7411458611488342, "step": 2018 }, { "clip_ratio": 0.0, "completion_length": 555.6896057128906, "epoch": 0.6461833893422948, "grad_norm": 0.3411266505718231, "kl": 0.4690343365073204, "learning_rate": 6.704368514446165e-06, "loss": 0.1149, "reward": 1.7442708730697631, "reward_std": 0.2608324535191059, "rewards/accuracy_reward": 0.0666666679084301, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.7338541865348815, "step": 2019 }, { "clip_ratio": 0.0, "completion_length": 549.6354339599609, "epoch": 0.646503440550488, "grad_norm": 0.24700689315795898, "kl": 0.6471106797456742, "learning_rate": 6.693818874460475e-06, "loss": 0.1784, "reward": 1.7640625357627868, "reward_std": 0.3167236685752869, "rewards/accuracy_reward": 0.09791667070239782, "rewards/format_reward": 0.9375000238418579, "rewards/tag_count_reward": 0.7286458611488342, "step": 2020 }, { "clip_ratio": 0.0, "completion_length": 573.764599609375, "epoch": 0.6468234917586814, "grad_norm": 0.18885403871536255, "kl": 0.628251314163208, "learning_rate": 6.683273364043066e-06, "loss": 0.1355, "reward": 1.7161458611488343, "reward_std": 0.2554626792669296, "rewards/accuracy_reward": 0.03541666809469461, "rewards/format_reward": 0.9479166805744171, "rewards/tag_count_reward": 0.7328125178813935, "step": 2021 }, { "clip_ratio": 0.0, "completion_length": 590.8416870117187, "epoch": 0.6471435429668747, "grad_norm": 0.3838038742542267, "kl": 0.5382826343178749, "learning_rate": 6.672731996365749e-06, "loss": 0.1536, "reward": 1.7510416984558106, "reward_std": 0.26691520512104033, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.9395833611488342, "rewards/tag_count_reward": 0.7302083492279052, "step": 2022 }, { "clip_ratio": 0.0, "completion_length": 569.8083557128906, "epoch": 0.647463594175068, "grad_norm": 0.47697538137435913, "kl": 0.7491498619318009, "learning_rate": 6.662194784595164e-06, "loss": 0.1175, "reward": 1.8489584088325501, "reward_std": 0.20880869925022125, "rewards/accuracy_reward": 0.1687500050291419, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7302083492279052, "step": 2023 }, { "clip_ratio": 0.0, "completion_length": 536.7791809082031, "epoch": 0.6477836453832613, "grad_norm": 0.19957475364208221, "kl": 0.45173963755369184, "learning_rate": 6.651661741892763e-06, "loss": 0.0891, "reward": 1.7526041865348816, "reward_std": 0.19860148280858994, "rewards/accuracy_reward": 0.05625, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7338541865348815, "step": 2024 }, { "clip_ratio": 0.0, "completion_length": 535.2416809082031, "epoch": 0.6481036965914546, "grad_norm": 0.21536926925182343, "kl": 0.37086462080478666, "learning_rate": 6.641132881414791e-06, "loss": 0.0983, "reward": 1.7984375357627869, "reward_std": 0.18779040426015853, "rewards/accuracy_reward": 0.09583333656191825, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7380208611488343, "step": 2025 }, { "clip_ratio": 0.0, "completion_length": 557.7312683105469, "epoch": 0.648423747799648, "grad_norm": 0.3578295111656189, "kl": 0.3105326473712921, "learning_rate": 6.63060821631226e-06, "loss": 0.1002, "reward": 1.8119792222976685, "reward_std": 0.23218104541301726, "rewards/accuracy_reward": 0.11875000223517418, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7369791865348816, "step": 2026 }, { "clip_ratio": 0.0, "completion_length": 563.6146057128906, "epoch": 0.6487437990078413, "grad_norm": 0.24499280750751495, "kl": 0.26186653785407543, "learning_rate": 6.6200877597309535e-06, "loss": 0.0855, "reward": 1.7635416746139527, "reward_std": 0.14513484984636307, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7427083432674408, "step": 2027 }, { "clip_ratio": 0.0, "completion_length": 541.0458526611328, "epoch": 0.6490638502160345, "grad_norm": 0.30822300910949707, "kl": 0.48599352315068245, "learning_rate": 6.609571524811387e-06, "loss": 0.1068, "reward": 1.7473958611488343, "reward_std": 0.24330125153064727, "rewards/accuracy_reward": 0.05625000018626451, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7369791865348816, "step": 2028 }, { "clip_ratio": 0.0, "completion_length": 544.8708526611329, "epoch": 0.6493839014242279, "grad_norm": 0.16421116888523102, "kl": 0.3570775203406811, "learning_rate": 6.599059524688813e-06, "loss": 0.0979, "reward": 1.7531250596046448, "reward_std": 0.19463911652565002, "rewards/accuracy_reward": 0.05625000149011612, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7427083551883698, "step": 2029 }, { "clip_ratio": 0.0, "completion_length": 561.38544921875, "epoch": 0.6497039526324212, "grad_norm": 0.24503841996192932, "kl": 0.3006763093173504, "learning_rate": 6.588551772493188e-06, "loss": 0.0805, "reward": 1.764062511920929, "reward_std": 0.1812247857451439, "rewards/accuracy_reward": 0.06041666716337204, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7369791865348816, "step": 2030 }, { "clip_ratio": 0.0, "completion_length": 581.9062805175781, "epoch": 0.6500240038406145, "grad_norm": 0.17819064855575562, "kl": 0.44421446323394775, "learning_rate": 6.578048281349165e-06, "loss": 0.1009, "reward": 1.6994791984558106, "reward_std": 0.2802324160933495, "rewards/accuracy_reward": 0.020833333395421504, "rewards/format_reward": 0.9437500298023224, "rewards/tag_count_reward": 0.7348958432674408, "step": 2031 }, { "clip_ratio": 0.0, "completion_length": 554.4812774658203, "epoch": 0.6503440550488078, "grad_norm": 0.11345545947551727, "kl": 0.21580078080296516, "learning_rate": 6.567549064376078e-06, "loss": 0.0719, "reward": 1.7338541984558105, "reward_std": 0.19010756611824037, "rewards/accuracy_reward": 0.027083334513008595, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7421875119209289, "step": 2032 }, { "clip_ratio": 0.0, "completion_length": 555.2791778564454, "epoch": 0.6506641062570011, "grad_norm": 0.12270388752222061, "kl": 0.2688896611332893, "learning_rate": 6.557054134687919e-06, "loss": 0.0888, "reward": 1.7796875476837157, "reward_std": 0.25824336111545565, "rewards/accuracy_reward": 0.0833333384245634, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7380208611488343, "step": 2033 }, { "clip_ratio": 0.0, "completion_length": 558.8437713623047, "epoch": 0.6509841574651944, "grad_norm": 0.07305414229631424, "kl": 0.26273534893989564, "learning_rate": 6.546563505393321e-06, "loss": 0.0784, "reward": 1.8338542222976684, "reward_std": 0.21032921522855758, "rewards/accuracy_reward": 0.12916667070239782, "rewards/format_reward": 0.9645833432674408, "rewards/tag_count_reward": 0.7401041805744171, "step": 2034 }, { "clip_ratio": 0.0, "completion_length": 551.4937622070313, "epoch": 0.6513042086733878, "grad_norm": 0.2946823537349701, "kl": 0.26316218823194504, "learning_rate": 6.536077189595554e-06, "loss": 0.0756, "reward": 1.7817708611488343, "reward_std": 0.23125024437904357, "rewards/accuracy_reward": 0.07708333432674408, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7380208551883698, "step": 2035 }, { "clip_ratio": 0.0, "completion_length": 575.3229339599609, "epoch": 0.651624259881581, "grad_norm": 0.1420002579689026, "kl": 0.17707625590264797, "learning_rate": 6.525595200392492e-06, "loss": 0.0632, "reward": 1.8250000119209289, "reward_std": 0.15849317163228988, "rewards/accuracy_reward": 0.11666666772216558, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7479166865348816, "step": 2036 }, { "clip_ratio": 0.0, "completion_length": 546.462515258789, "epoch": 0.6519443110897744, "grad_norm": 0.19155986607074738, "kl": 0.36504338271915915, "learning_rate": 6.515117550876615e-06, "loss": 0.0828, "reward": 1.8067708730697631, "reward_std": 0.21143171042203904, "rewards/accuracy_reward": 0.10625000223517418, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7380208373069763, "step": 2037 }, { "clip_ratio": 0.0, "completion_length": 543.7979339599609, "epoch": 0.6522643622979677, "grad_norm": 0.07730981707572937, "kl": 0.13812856636941434, "learning_rate": 6.504644254134969e-06, "loss": 0.0477, "reward": 1.7963541984558105, "reward_std": 0.16553531140089034, "rewards/accuracy_reward": 0.07083333469927311, "rewards/format_reward": 0.9791666865348816, "rewards/tag_count_reward": 0.7463541805744172, "step": 2038 }, { "clip_ratio": 0.0, "completion_length": 564.2937744140625, "epoch": 0.652584413506161, "grad_norm": 0.1270231455564499, "kl": 0.2916695766150951, "learning_rate": 6.4941753232491725e-06, "loss": 0.0852, "reward": 1.734375011920929, "reward_std": 0.19963812083005905, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7364583551883698, "step": 2039 }, { "clip_ratio": 0.0, "completion_length": 571.2229370117187, "epoch": 0.6529044647143543, "grad_norm": 0.26288750767707825, "kl": 0.240986368060112, "learning_rate": 6.483710771295391e-06, "loss": 0.0956, "reward": 1.7734375476837159, "reward_std": 0.2022160619497299, "rewards/accuracy_reward": 0.07083333432674407, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7442708611488342, "step": 2040 }, { "clip_ratio": 0.0, "completion_length": 571.6500183105469, "epoch": 0.6532245159225476, "grad_norm": 0.123477503657341, "kl": 0.2088707573711872, "learning_rate": 6.4732506113443215e-06, "loss": 0.1214, "reward": 1.8208333849906921, "reward_std": 0.21984335780143738, "rewards/accuracy_reward": 0.13333333656191826, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7395833492279053, "step": 2041 }, { "clip_ratio": 0.0, "completion_length": 559.595849609375, "epoch": 0.6535445671307409, "grad_norm": 0.14980222284793854, "kl": 0.2989436075091362, "learning_rate": 6.462794856461167e-06, "loss": 0.0931, "reward": 1.745312547683716, "reward_std": 0.17107095420360566, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7473958551883697, "step": 2042 }, { "clip_ratio": 0.0, "completion_length": 558.3958557128906, "epoch": 0.6538646183389343, "grad_norm": 0.2861766219139099, "kl": 0.4346440315246582, "learning_rate": 6.452343519705637e-06, "loss": 0.1278, "reward": 1.735937523841858, "reward_std": 0.2187936559319496, "rewards/accuracy_reward": 0.04791666828095913, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7338541865348815, "step": 2043 }, { "clip_ratio": 0.0, "completion_length": 558.0354370117187, "epoch": 0.6541846695471275, "grad_norm": 0.09741534292697906, "kl": 0.24645915627479553, "learning_rate": 6.441896614131918e-06, "loss": 0.0712, "reward": 1.7723958849906922, "reward_std": 0.16998118087649344, "rewards/accuracy_reward": 0.06250000298023224, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7411458551883697, "step": 2044 }, { "clip_ratio": 0.0, "completion_length": 569.9250183105469, "epoch": 0.6545047207553208, "grad_norm": 0.22848017513751984, "kl": 0.35786485224962233, "learning_rate": 6.431454152788659e-06, "loss": 0.0894, "reward": 1.7375000238418579, "reward_std": 0.19461821913719177, "rewards/accuracy_reward": 0.03541666716337204, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.743750023841858, "step": 2045 }, { "clip_ratio": 0.0, "completion_length": 571.7562805175781, "epoch": 0.6548247719635142, "grad_norm": 0.16886331140995026, "kl": 0.26599433794617655, "learning_rate": 6.421016148718968e-06, "loss": 0.1011, "reward": 1.8000000357627868, "reward_std": 0.23989782929420472, "rewards/accuracy_reward": 0.10625000223517418, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.735416692495346, "step": 2046 }, { "clip_ratio": 0.0, "completion_length": 571.939599609375, "epoch": 0.6551448231717075, "grad_norm": 0.28325778245925903, "kl": 0.4531488560140133, "learning_rate": 6.410582614960375e-06, "loss": 0.1435, "reward": 1.8041667222976685, "reward_std": 0.27296153604984286, "rewards/accuracy_reward": 0.13750000298023224, "rewards/format_reward": 0.9270833551883697, "rewards/tag_count_reward": 0.7395833492279053, "step": 2047 }, { "clip_ratio": 0.0, "completion_length": 571.6479370117188, "epoch": 0.6554648743799008, "grad_norm": 0.14918236434459686, "kl": 0.2889488823711872, "learning_rate": 6.400153564544831e-06, "loss": 0.095, "reward": 1.8020833730697632, "reward_std": 0.19902113303542138, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7416666865348815, "step": 2048 }, { "clip_ratio": 0.0, "completion_length": 570.733349609375, "epoch": 0.6557849255880941, "grad_norm": 0.33905163407325745, "kl": 0.5450758814811707, "learning_rate": 6.389729010498693e-06, "loss": 0.1293, "reward": 1.751562523841858, "reward_std": 0.21070124506950377, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7348958611488342, "step": 2049 }, { "clip_ratio": 0.0, "completion_length": 620.783349609375, "epoch": 0.6561049767962874, "grad_norm": 0.18853819370269775, "kl": 0.4303215779364109, "learning_rate": 6.379308965842689e-06, "loss": 0.1048, "reward": 1.7203125476837158, "reward_std": 0.2303944431245327, "rewards/accuracy_reward": 0.04375000111758709, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.732812511920929, "step": 2050 }, { "clip_ratio": 0.0, "completion_length": 612.1937683105468, "epoch": 0.6564250280044808, "grad_norm": 0.2386639267206192, "kl": 0.8055388882756234, "learning_rate": 6.368893443591924e-06, "loss": 0.1558, "reward": 1.6734375476837158, "reward_std": 0.32045554369688034, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.9104166805744172, "rewards/tag_count_reward": 0.7234375178813934, "step": 2051 }, { "clip_ratio": 0.0, "completion_length": 555.7479248046875, "epoch": 0.656745079212674, "grad_norm": 0.3026362955570221, "kl": 0.38807725757360456, "learning_rate": 6.3584824567558525e-06, "loss": 0.0985, "reward": 1.7463542103767395, "reward_std": 0.18500390723347665, "rewards/accuracy_reward": 0.04375000223517418, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7380208611488343, "step": 2052 }, { "clip_ratio": 0.0, "completion_length": 577.1458435058594, "epoch": 0.6570651304208673, "grad_norm": 0.48022738099098206, "kl": 0.4987993150949478, "learning_rate": 6.34807601833826e-06, "loss": 0.1314, "reward": 1.6812500476837158, "reward_std": 0.25787831619381907, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.9395833611488342, "rewards/tag_count_reward": 0.7312500178813934, "step": 2053 }, { "clip_ratio": 0.0, "completion_length": 572.7750213623046, "epoch": 0.6573851816290607, "grad_norm": 0.26611456274986267, "kl": 0.5643174603581429, "learning_rate": 6.33767414133726e-06, "loss": 0.1348, "reward": 1.7416667222976685, "reward_std": 0.2692039854824543, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.9375000119209289, "rewards/tag_count_reward": 0.7333333492279053, "step": 2054 }, { "clip_ratio": 0.0, "completion_length": 570.2396057128906, "epoch": 0.6577052328372539, "grad_norm": 0.19766655564308167, "kl": 0.7188455149531364, "learning_rate": 6.327276838745257e-06, "loss": 0.1722, "reward": 1.702083373069763, "reward_std": 0.2696689248085022, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.9354166805744171, "rewards/tag_count_reward": 0.7270833492279053, "step": 2055 }, { "clip_ratio": 0.0, "completion_length": 590.5333557128906, "epoch": 0.6580252840454472, "grad_norm": 0.38821715116500854, "kl": 0.7406436443328858, "learning_rate": 6.316884123548947e-06, "loss": 0.1643, "reward": 1.692708384990692, "reward_std": 0.32253962606191633, "rewards/accuracy_reward": 0.04166666734963655, "rewards/format_reward": 0.9208333551883697, "rewards/tag_count_reward": 0.7302083551883698, "step": 2056 }, { "clip_ratio": 0.0, "completion_length": 570.9312713623046, "epoch": 0.6583453352536406, "grad_norm": 0.3075641691684723, "kl": 0.5838876664638519, "learning_rate": 6.306496008729302e-06, "loss": 0.1149, "reward": 1.8093750476837158, "reward_std": 0.21804522201418877, "rewards/accuracy_reward": 0.12708333730697632, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7343750298023224, "step": 2057 }, { "clip_ratio": 0.0, "completion_length": 611.214599609375, "epoch": 0.6586653864618339, "grad_norm": 0.2144460529088974, "kl": 1.0372873276472092, "learning_rate": 6.29611250726154e-06, "loss": 0.2162, "reward": 1.6010417103767396, "reward_std": 0.4080072194337845, "rewards/accuracy_reward": 0.012500000186264515, "rewards/format_reward": 0.8895833492279053, "rewards/tag_count_reward": 0.6989583432674408, "step": 2058 }, { "clip_ratio": 0.0, "completion_length": 597.3250244140625, "epoch": 0.6589854376700272, "grad_norm": 0.2626437842845917, "kl": 0.6710316807031631, "learning_rate": 6.285733632115118e-06, "loss": 0.1558, "reward": 1.731770884990692, "reward_std": 0.3729784592986107, "rewards/accuracy_reward": 0.09583333507180214, "rewards/format_reward": 0.916666692495346, "rewards/tag_count_reward": 0.7192708611488342, "step": 2059 }, { "clip_ratio": 0.0, "completion_length": 578.3000183105469, "epoch": 0.6593054888782205, "grad_norm": 0.42148250341415405, "kl": 0.7080471813678741, "learning_rate": 6.275359396253721e-06, "loss": 0.1478, "reward": 1.6453125476837158, "reward_std": 0.3132076248526573, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.9187500238418579, "rewards/tag_count_reward": 0.7244791865348816, "step": 2060 }, { "clip_ratio": 0.0, "completion_length": 542.527099609375, "epoch": 0.6596255400864138, "grad_norm": 0.3597671091556549, "kl": 0.5296546339988708, "learning_rate": 6.264989812635227e-06, "loss": 0.1318, "reward": 1.7067708492279052, "reward_std": 0.2610057801008224, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.9333333551883698, "rewards/tag_count_reward": 0.729687511920929, "step": 2061 }, { "clip_ratio": 0.0, "completion_length": 566.3333526611328, "epoch": 0.6599455912946072, "grad_norm": 0.3411342203617096, "kl": 0.5472671233117581, "learning_rate": 6.2546248942117134e-06, "loss": 0.1035, "reward": 1.7197917103767395, "reward_std": 0.2231937639415264, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.7385416924953461, "step": 2062 }, { "clip_ratio": 0.0, "completion_length": 589.7750183105469, "epoch": 0.6602656425028004, "grad_norm": 0.22413881123065948, "kl": 0.5802815616130829, "learning_rate": 6.244264653929428e-06, "loss": 0.1309, "reward": 1.745833396911621, "reward_std": 0.306681290268898, "rewards/accuracy_reward": 0.0916666679084301, "rewards/format_reward": 0.9333333551883698, "rewards/tag_count_reward": 0.7208333432674408, "step": 2063 }, { "clip_ratio": 0.0, "completion_length": 547.4541778564453, "epoch": 0.6605856937109937, "grad_norm": 0.3446758985519409, "kl": 0.7506268709897995, "learning_rate": 6.2339091047287725e-06, "loss": 0.1382, "reward": 1.793750035762787, "reward_std": 0.32375111281871793, "rewards/accuracy_reward": 0.16250000502914191, "rewards/format_reward": 0.9166666805744171, "rewards/tag_count_reward": 0.7145833492279052, "step": 2064 }, { "clip_ratio": 0.0, "completion_length": 538.904183959961, "epoch": 0.6609057449191871, "grad_norm": 0.31046417355537415, "kl": 0.7741656035184861, "learning_rate": 6.2235582595442935e-06, "loss": 0.1824, "reward": 1.8447916984558106, "reward_std": 0.2975379958748817, "rewards/accuracy_reward": 0.18541667237877846, "rewards/format_reward": 0.931250023841858, "rewards/tag_count_reward": 0.7281250178813934, "step": 2065 }, { "clip_ratio": 0.0, "completion_length": 566.283349609375, "epoch": 0.6612257961273804, "grad_norm": 0.21272774040699005, "kl": 0.6825548261404037, "learning_rate": 6.213212131304664e-06, "loss": 0.1454, "reward": 1.722395896911621, "reward_std": 0.3414647400379181, "rewards/accuracy_reward": 0.08541666828095913, "rewards/format_reward": 0.9208333551883697, "rewards/tag_count_reward": 0.7161458551883697, "step": 2066 }, { "clip_ratio": 0.0, "completion_length": 578.404183959961, "epoch": 0.6615458473355736, "grad_norm": 0.18681898713111877, "kl": 0.5913916632533074, "learning_rate": 6.202870732932656e-06, "loss": 0.1527, "reward": 1.7916667103767394, "reward_std": 0.31743351817131044, "rewards/accuracy_reward": 0.1458333356305957, "rewards/format_reward": 0.9187500178813934, "rewards/tag_count_reward": 0.7270833551883698, "step": 2067 }, { "clip_ratio": 0.0, "completion_length": 565.8375244140625, "epoch": 0.661865898543767, "grad_norm": 0.2054324448108673, "kl": 0.5609342604875565, "learning_rate": 6.19253407734514e-06, "loss": 0.1721, "reward": 1.7661458730697632, "reward_std": 0.2891497790813446, "rewards/accuracy_reward": 0.11666667014360428, "rewards/format_reward": 0.9250000298023224, "rewards/tag_count_reward": 0.7244791865348816, "step": 2068 }, { "clip_ratio": 0.0, "completion_length": 550.2208526611328, "epoch": 0.6621859497519603, "grad_norm": 0.1147092655301094, "kl": 0.2838616266846657, "learning_rate": 6.182202177453063e-06, "loss": 0.1007, "reward": 1.7598958611488342, "reward_std": 0.22217274978756904, "rewards/accuracy_reward": 0.06875000223517418, "rewards/format_reward": 0.9520833611488342, "rewards/tag_count_reward": 0.7390625178813934, "step": 2069 }, { "clip_ratio": 0.0, "completion_length": 568.927099609375, "epoch": 0.6625060009601537, "grad_norm": 0.1614256501197815, "kl": 0.4955372139811516, "learning_rate": 6.171875046161429e-06, "loss": 0.1084, "reward": 1.7739583611488343, "reward_std": 0.2446311503648758, "rewards/accuracy_reward": 0.0979166679084301, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.7322916865348816, "step": 2070 }, { "clip_ratio": 0.0, "completion_length": 534.2250244140625, "epoch": 0.6628260521683469, "grad_norm": 0.15170036256313324, "kl": 0.2690488576889038, "learning_rate": 6.161552696369291e-06, "loss": 0.0638, "reward": 1.7848958611488341, "reward_std": 0.19568712040781974, "rewards/accuracy_reward": 0.08125000409781932, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7411458432674408, "step": 2071 }, { "clip_ratio": 0.0, "completion_length": 559.9646026611329, "epoch": 0.6631461033765402, "grad_norm": 0.11659003794193268, "kl": 0.32981459945440295, "learning_rate": 6.151235140969719e-06, "loss": 0.1103, "reward": 1.7593750476837158, "reward_std": 0.25664503276348116, "rewards/accuracy_reward": 0.07500000167638063, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7343750298023224, "step": 2072 }, { "clip_ratio": 0.0, "completion_length": 561.8000183105469, "epoch": 0.6634661545847336, "grad_norm": 0.11869725584983826, "kl": 0.30063339732587335, "learning_rate": 6.1409223928498085e-06, "loss": 0.0911, "reward": 1.8427083849906922, "reward_std": 0.17913474664092063, "rewards/accuracy_reward": 0.1354166718199849, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7427083551883698, "step": 2073 }, { "clip_ratio": 0.0, "completion_length": 551.7979339599609, "epoch": 0.6637862057929269, "grad_norm": 0.17836003005504608, "kl": 0.41804255843162536, "learning_rate": 6.130614464890645e-06, "loss": 0.1133, "reward": 1.7947917342185975, "reward_std": 0.2575798869132996, "rewards/accuracy_reward": 0.10625000335276127, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7343750238418579, "step": 2074 }, { "clip_ratio": 0.0, "completion_length": 543.6479339599609, "epoch": 0.6641062570011201, "grad_norm": 0.2909286916255951, "kl": 0.6877239581197501, "learning_rate": 6.120311369967286e-06, "loss": 0.0902, "reward": 1.7838542222976685, "reward_std": 0.179420168697834, "rewards/accuracy_reward": 0.10833333730697632, "rewards/format_reward": 0.9416666805744172, "rewards/tag_count_reward": 0.7338541865348815, "step": 2075 }, { "clip_ratio": 0.0, "completion_length": 530.4791809082031, "epoch": 0.6644263082093135, "grad_norm": 0.16476832330226898, "kl": 0.2650918196886778, "learning_rate": 6.11001312094876e-06, "loss": 0.0947, "reward": 1.8890625476837157, "reward_std": 0.19297146275639535, "rewards/accuracy_reward": 0.17500000298023224, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7453125178813934, "step": 2076 }, { "clip_ratio": 0.0, "completion_length": 560.3083557128906, "epoch": 0.6647463594175068, "grad_norm": 0.15465334057807922, "kl": 0.4669275902211666, "learning_rate": 6.099719730698046e-06, "loss": 0.1589, "reward": 1.7192708849906921, "reward_std": 0.27212869971990583, "rewards/accuracy_reward": 0.04375000223517418, "rewards/format_reward": 0.9395833611488342, "rewards/tag_count_reward": 0.7359375178813934, "step": 2077 }, { "clip_ratio": 0.0, "completion_length": 562.4958435058594, "epoch": 0.6650664106257002, "grad_norm": 0.1373804807662964, "kl": 0.41616974845528604, "learning_rate": 6.089431212072043e-06, "loss": 0.098, "reward": 1.7343750238418578, "reward_std": 0.2505802020430565, "rewards/accuracy_reward": 0.039583333767950535, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7427083611488342, "step": 2078 }, { "clip_ratio": 0.0, "completion_length": 533.3833557128906, "epoch": 0.6653864618338934, "grad_norm": 0.13085642457008362, "kl": 0.3719322353601456, "learning_rate": 6.079147577921576e-06, "loss": 0.1556, "reward": 1.8166666984558106, "reward_std": 0.202004524320364, "rewards/accuracy_reward": 0.12500000447034837, "rewards/format_reward": 0.9437500238418579, "rewards/tag_count_reward": 0.7479166865348816, "step": 2079 }, { "clip_ratio": 0.0, "completion_length": 554.2479309082031, "epoch": 0.6657065130420867, "grad_norm": 0.14317762851715088, "kl": 0.3922991409897804, "learning_rate": 6.068868841091361e-06, "loss": 0.125, "reward": 1.7104166984558105, "reward_std": 0.25869596004486084, "rewards/accuracy_reward": 0.02708333358168602, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7375000178813934, "step": 2080 }, { "clip_ratio": 0.0, "completion_length": 544.2895965576172, "epoch": 0.6660265642502801, "grad_norm": 0.15368254482746124, "kl": 0.27517750635743143, "learning_rate": 6.05859501442e-06, "loss": 0.0966, "reward": 1.7619792222976685, "reward_std": 0.2081921711564064, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.7473958551883697, "step": 2081 }, { "clip_ratio": 0.0, "completion_length": 551.2271057128906, "epoch": 0.6663466154584734, "grad_norm": 0.20891135931015015, "kl": 0.4285693295300007, "learning_rate": 6.048326110739968e-06, "loss": 0.1587, "reward": 1.7369792222976685, "reward_std": 0.22642296701669692, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.9375000298023224, "rewards/tag_count_reward": 0.743229192495346, "step": 2082 }, { "clip_ratio": 0.0, "completion_length": 503.933349609375, "epoch": 0.6666666666666666, "grad_norm": 0.05916117876768112, "kl": 0.23351404666900635, "learning_rate": 6.038062142877583e-06, "loss": 0.06, "reward": 1.798958384990692, "reward_std": 0.12938002720475197, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7510416805744171, "step": 2083 }, { "clip_ratio": 0.0, "completion_length": 556.4312683105469, "epoch": 0.66698671787486, "grad_norm": 0.16329483687877655, "kl": 0.3161396749317646, "learning_rate": 6.027803123653e-06, "loss": 0.1275, "reward": 1.7046875476837158, "reward_std": 0.2415686145424843, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.9291666865348815, "rewards/tag_count_reward": 0.7421875298023224, "step": 2084 }, { "clip_ratio": 0.0, "completion_length": 545.9396026611328, "epoch": 0.6673067690830533, "grad_norm": 0.09667642414569855, "kl": 0.3785520136356354, "learning_rate": 6.0175490658801934e-06, "loss": 0.1069, "reward": 1.690625047683716, "reward_std": 0.22385728284716605, "rewards/accuracy_reward": 0.010416667163372039, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7343750178813935, "step": 2085 }, { "clip_ratio": 0.0, "completion_length": 544.395849609375, "epoch": 0.6676268202912466, "grad_norm": 0.08739566057920456, "kl": 0.34865760840475557, "learning_rate": 6.00729998236694e-06, "loss": 0.1112, "reward": 1.7708333730697632, "reward_std": 0.20698952823877334, "rewards/accuracy_reward": 0.09166666865348816, "rewards/format_reward": 0.9395833551883698, "rewards/tag_count_reward": 0.7395833611488343, "step": 2086 }, { "clip_ratio": 0.0, "completion_length": 578.7646118164063, "epoch": 0.6679468714994399, "grad_norm": 0.12396329641342163, "kl": 0.40712554156780245, "learning_rate": 5.997055885914806e-06, "loss": 0.1277, "reward": 1.7416667222976685, "reward_std": 0.2734927900135517, "rewards/accuracy_reward": 0.0687500013038516, "rewards/format_reward": 0.9375000119209289, "rewards/tag_count_reward": 0.7354166865348816, "step": 2087 }, { "clip_ratio": 0.0, "completion_length": 545.6958526611328, "epoch": 0.6682669227076332, "grad_norm": 0.20949289202690125, "kl": 0.435461837798357, "learning_rate": 5.986816789319123e-06, "loss": 0.1051, "reward": 1.7489583849906922, "reward_std": 0.21258105263113974, "rewards/accuracy_reward": 0.06250000149011611, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7343750178813935, "step": 2088 }, { "clip_ratio": 0.0, "completion_length": 547.9646057128906, "epoch": 0.6685869739158266, "grad_norm": 0.08775470405817032, "kl": 0.4969747729599476, "learning_rate": 5.976582705368982e-06, "loss": 0.0677, "reward": 1.7697917103767395, "reward_std": 0.2106465920805931, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7364583551883698, "step": 2089 }, { "clip_ratio": 0.0, "completion_length": 555.9708465576172, "epoch": 0.6689070251240199, "grad_norm": 0.07833829522132874, "kl": 0.30936159677803515, "learning_rate": 5.966353646847215e-06, "loss": 0.115, "reward": 1.7484375476837157, "reward_std": 0.23181662410497667, "rewards/accuracy_reward": 0.060416669212281705, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.735937523841858, "step": 2090 }, { "clip_ratio": 0.0, "completion_length": 538.608349609375, "epoch": 0.6692270763322131, "grad_norm": 0.10873490571975708, "kl": 0.26770354211330416, "learning_rate": 5.956129626530376e-06, "loss": 0.0866, "reward": 1.7546875596046447, "reward_std": 0.1789041481912136, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7401041805744171, "step": 2091 }, { "clip_ratio": 0.0, "completion_length": 535.8583465576172, "epoch": 0.6695471275404065, "grad_norm": 0.17797769606113434, "kl": 0.4399689495563507, "learning_rate": 5.945910657188717e-06, "loss": 0.1301, "reward": 1.7484375476837157, "reward_std": 0.2705027997493744, "rewards/accuracy_reward": 0.06666666828095913, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7338541924953461, "step": 2092 }, { "clip_ratio": 0.0, "completion_length": 535.7645965576172, "epoch": 0.6698671787485998, "grad_norm": 0.2502882480621338, "kl": 0.25185356885194776, "learning_rate": 5.9356967515861955e-06, "loss": 0.0978, "reward": 1.8630208849906922, "reward_std": 0.2282662123441696, "rewards/accuracy_reward": 0.16250000577419996, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7421875059604645, "step": 2093 }, { "clip_ratio": 0.0, "completion_length": 540.6791839599609, "epoch": 0.6701872299567931, "grad_norm": 0.24098429083824158, "kl": 0.21436882950365543, "learning_rate": 5.925487922480431e-06, "loss": 0.0998, "reward": 1.8354167222976685, "reward_std": 0.20333680436015128, "rewards/accuracy_reward": 0.1291666692122817, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7416666805744171, "step": 2094 }, { "clip_ratio": 0.0, "completion_length": 543.0687683105468, "epoch": 0.6705072811649864, "grad_norm": 0.16339395940303802, "kl": 0.5092102646827698, "learning_rate": 5.9152841826227136e-06, "loss": 0.1156, "reward": 1.8442708849906921, "reward_std": 0.2604293584823608, "rewards/accuracy_reward": 0.15208333637565374, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7338541924953461, "step": 2095 }, { "clip_ratio": 0.0, "completion_length": 539.2104309082031, "epoch": 0.6708273323731797, "grad_norm": 0.09706208854913712, "kl": 0.4753930263221264, "learning_rate": 5.905085544757965e-06, "loss": 0.12, "reward": 1.8401042103767395, "reward_std": 0.21673982813954354, "rewards/accuracy_reward": 0.15208333730697632, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7338541865348815, "step": 2096 }, { "clip_ratio": 0.0, "completion_length": 552.1937713623047, "epoch": 0.671147383581373, "grad_norm": 0.19295969605445862, "kl": 0.42010639905929564, "learning_rate": 5.894892021624744e-06, "loss": 0.1263, "reward": 1.7885417103767396, "reward_std": 0.25435220301151273, "rewards/accuracy_reward": 0.11041666977107525, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7302083611488343, "step": 2097 }, { "clip_ratio": 0.0, "completion_length": 548.1750183105469, "epoch": 0.6714674347895663, "grad_norm": 0.11223884671926498, "kl": 0.31995190382003785, "learning_rate": 5.884703625955219e-06, "loss": 0.1004, "reward": 1.7244791865348816, "reward_std": 0.22140763103961944, "rewards/accuracy_reward": 0.03125000149011612, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7348958551883698, "step": 2098 }, { "clip_ratio": 0.0, "completion_length": 545.5104370117188, "epoch": 0.6717874859977596, "grad_norm": 0.14636114239692688, "kl": 0.4229055255651474, "learning_rate": 5.874520370475154e-06, "loss": 0.0831, "reward": 1.7130208730697631, "reward_std": 0.16997253373265267, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7401041924953461, "step": 2099 }, { "clip_ratio": 0.0, "completion_length": 569.6271118164062, "epoch": 0.672107537205953, "grad_norm": 0.10089104622602463, "kl": 0.27489122599363325, "learning_rate": 5.864342267903885e-06, "loss": 0.0913, "reward": 1.754687535762787, "reward_std": 0.2580702304840088, "rewards/accuracy_reward": 0.06666666939854622, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7338541984558106, "step": 2100 }, { "clip_ratio": 0.0, "completion_length": 554.5208526611328, "epoch": 0.6724275884141463, "grad_norm": 0.10464661568403244, "kl": 0.20850623920559883, "learning_rate": 5.854169330954324e-06, "loss": 0.0723, "reward": 1.8265625596046449, "reward_std": 0.15675044655799866, "rewards/accuracy_reward": 0.10833333786576986, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.743229192495346, "step": 2101 }, { "clip_ratio": 0.0, "completion_length": 544.170849609375, "epoch": 0.6727476396223395, "grad_norm": 0.18542306125164032, "kl": 0.2565278984606266, "learning_rate": 5.84400157233292e-06, "loss": 0.0743, "reward": 1.7557292222976684, "reward_std": 0.227911539375782, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7411458671092988, "step": 2102 }, { "clip_ratio": 0.0, "completion_length": 551.0125183105469, "epoch": 0.6730676908305329, "grad_norm": 0.12582838535308838, "kl": 0.4552136674523354, "learning_rate": 5.833839004739662e-06, "loss": 0.1309, "reward": 1.7572917103767396, "reward_std": 0.26424730867147445, "rewards/accuracy_reward": 0.07291666772216558, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7302083432674408, "step": 2103 }, { "clip_ratio": 0.0, "completion_length": 520.989599609375, "epoch": 0.6733877420387262, "grad_norm": 0.10714533179998398, "kl": 0.242107355594635, "learning_rate": 5.823681640868049e-06, "loss": 0.092, "reward": 1.7453125357627868, "reward_std": 0.1701894871890545, "rewards/accuracy_reward": 0.0312500013038516, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7411458551883697, "step": 2104 }, { "clip_ratio": 0.0, "completion_length": 540.629183959961, "epoch": 0.6737077932469195, "grad_norm": 0.10627640783786774, "kl": 0.43605479300022126, "learning_rate": 5.8135294934050855e-06, "loss": 0.0705, "reward": 1.7822916984558106, "reward_std": 0.16257481276988983, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7385416805744172, "step": 2105 }, { "clip_ratio": 0.0, "completion_length": 565.1437683105469, "epoch": 0.6740278444551128, "grad_norm": 0.16275854408740997, "kl": 0.2939316496253014, "learning_rate": 5.803382575031257e-06, "loss": 0.1094, "reward": 1.7255208849906922, "reward_std": 0.21441070288419722, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7380208492279052, "step": 2106 }, { "clip_ratio": 0.0, "completion_length": 550.0083526611328, "epoch": 0.6743478956633061, "grad_norm": 0.2260814607143402, "kl": 0.22412323877215384, "learning_rate": 5.793240898420521e-06, "loss": 0.0938, "reward": 1.7854167103767395, "reward_std": 0.16013515293598174, "rewards/accuracy_reward": 0.07500000298023224, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7416666805744171, "step": 2107 }, { "clip_ratio": 0.0, "completion_length": 540.847933959961, "epoch": 0.6746679468714994, "grad_norm": 0.14437879621982574, "kl": 0.19934857040643691, "learning_rate": 5.783104476240284e-06, "loss": 0.0919, "reward": 1.806770884990692, "reward_std": 0.19802356511354446, "rewards/accuracy_reward": 0.09791666865348816, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7421875298023224, "step": 2108 }, { "clip_ratio": 0.0, "completion_length": 551.7958557128907, "epoch": 0.6749879980796928, "grad_norm": 0.14482718706130981, "kl": 0.23657144084572793, "learning_rate": 5.772973321151392e-06, "loss": 0.0593, "reward": 1.7963541865348815, "reward_std": 0.12375809252262115, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7421875119209289, "step": 2109 }, { "clip_ratio": 0.0, "completion_length": 550.0229370117188, "epoch": 0.675308049287886, "grad_norm": 0.15969114005565643, "kl": 0.32010622769594194, "learning_rate": 5.762847445808111e-06, "loss": 0.0906, "reward": 1.7276041865348817, "reward_std": 0.20104559063911437, "rewards/accuracy_reward": 0.02708333432674408, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7359375178813934, "step": 2110 }, { "clip_ratio": 0.0, "completion_length": 562.560433959961, "epoch": 0.6756281004960794, "grad_norm": 0.09771303087472916, "kl": 0.2898152723908424, "learning_rate": 5.7527268628581175e-06, "loss": 0.0973, "reward": 1.7750000357627869, "reward_std": 0.20687551125884057, "rewards/accuracy_reward": 0.07291666902601719, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7395833551883697, "step": 2111 }, { "clip_ratio": 0.0, "completion_length": 548.3541931152344, "epoch": 0.6759481517042727, "grad_norm": 0.12097357213497162, "kl": 0.20418170019984244, "learning_rate": 5.7426115849424635e-06, "loss": 0.0705, "reward": 1.7692708849906922, "reward_std": 0.17754657715559005, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7380208492279052, "step": 2112 }, { "clip_ratio": 0.0, "completion_length": 559.9541870117188, "epoch": 0.676268202912466, "grad_norm": 0.14658434689044952, "kl": 0.29820197224617007, "learning_rate": 5.73250162469559e-06, "loss": 0.054, "reward": 1.7385416984558106, "reward_std": 0.18357955366373063, "rewards/accuracy_reward": 0.03541666809469461, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7406250178813935, "step": 2113 }, { "clip_ratio": 0.0, "completion_length": 541.6000152587891, "epoch": 0.6765882541206593, "grad_norm": 0.15589158236980438, "kl": 0.24293862506747246, "learning_rate": 5.722396994745284e-06, "loss": 0.0957, "reward": 1.790625023841858, "reward_std": 0.2170632876455784, "rewards/accuracy_reward": 0.08750000260770321, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7406250238418579, "step": 2114 }, { "clip_ratio": 0.0, "completion_length": 547.4604370117188, "epoch": 0.6769083053288526, "grad_norm": 0.13007795810699463, "kl": 0.1325427707284689, "learning_rate": 5.712297707712694e-06, "loss": 0.0346, "reward": 1.7927083730697633, "reward_std": 0.0649714283645153, "rewards/accuracy_reward": 0.05625000149011612, "rewards/format_reward": 0.987500011920929, "rewards/tag_count_reward": 0.7489583373069764, "step": 2115 }, { "clip_ratio": 0.0, "completion_length": 554.7604431152344, "epoch": 0.6772283565370459, "grad_norm": 0.09760616719722748, "kl": 0.18721306025981904, "learning_rate": 5.702203776212269e-06, "loss": 0.0737, "reward": 1.8114583849906922, "reward_std": 0.1576378509402275, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7427083492279053, "step": 2116 }, { "clip_ratio": 0.0, "completion_length": 523.0771026611328, "epoch": 0.6775484077452393, "grad_norm": 0.24982647597789764, "kl": 0.19170795604586602, "learning_rate": 5.692115212851786e-06, "loss": 0.0645, "reward": 1.8317708730697633, "reward_std": 0.17303552776575087, "rewards/accuracy_reward": 0.10833333544433117, "rewards/format_reward": 0.9770833551883698, "rewards/tag_count_reward": 0.7463541805744172, "step": 2117 }, { "clip_ratio": 0.0, "completion_length": 531.4708557128906, "epoch": 0.6778684589534325, "grad_norm": 0.1240825280547142, "kl": 0.2379318844527006, "learning_rate": 5.682032030232314e-06, "loss": 0.0579, "reward": 1.778125035762787, "reward_std": 0.1435864046216011, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7406250178813935, "step": 2118 }, { "clip_ratio": 0.0, "completion_length": 575.1812683105469, "epoch": 0.6781885101616258, "grad_norm": 0.13775448501110077, "kl": 0.38389052599668505, "learning_rate": 5.6719542409482e-06, "loss": 0.0865, "reward": 1.7364583730697631, "reward_std": 0.22209640294313432, "rewards/accuracy_reward": 0.05416666846722364, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.7385416924953461, "step": 2119 }, { "clip_ratio": 0.0, "completion_length": 575.2541870117187, "epoch": 0.6785085613698192, "grad_norm": 0.1977604329586029, "kl": 0.2928472336381674, "learning_rate": 5.6618818575870486e-06, "loss": 0.0951, "reward": 1.7718750476837157, "reward_std": 0.2150444954633713, "rewards/accuracy_reward": 0.07291666809469462, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.740625011920929, "step": 2120 }, { "clip_ratio": 0.0, "completion_length": 546.2041870117188, "epoch": 0.6788286125780125, "grad_norm": 0.1186528429389, "kl": 0.27777676060795786, "learning_rate": 5.6518148927297215e-06, "loss": 0.0865, "reward": 1.8359375238418578, "reward_std": 0.18982706367969512, "rewards/accuracy_reward": 0.12708333749324083, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7401041865348816, "step": 2121 }, { "clip_ratio": 0.0, "completion_length": 566.9166778564453, "epoch": 0.6791486637862058, "grad_norm": 0.24591495096683502, "kl": 0.3266368605196476, "learning_rate": 5.6417533589503036e-06, "loss": 0.0709, "reward": 1.7572916865348815, "reward_std": 0.1611829034984112, "rewards/accuracy_reward": 0.047916668094694616, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7427083492279053, "step": 2122 }, { "clip_ratio": 0.0, "completion_length": 564.0229339599609, "epoch": 0.6794687149943991, "grad_norm": 0.1680237203836441, "kl": 0.30815255306661127, "learning_rate": 5.631697268816114e-06, "loss": 0.0951, "reward": 1.7854167103767395, "reward_std": 0.20621230602264404, "rewards/accuracy_reward": 0.07708333600312471, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7395833432674408, "step": 2123 }, { "clip_ratio": 0.0, "completion_length": 560.6750244140625, "epoch": 0.6797887662025924, "grad_norm": 0.19045765697956085, "kl": 0.41359340101480485, "learning_rate": 5.621646634887647e-06, "loss": 0.1132, "reward": 1.678645873069763, "reward_std": 0.2574477940797806, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.9375000178813935, "rewards/tag_count_reward": 0.7328125238418579, "step": 2124 }, { "clip_ratio": 0.0, "completion_length": 533.6187683105469, "epoch": 0.6801088174107858, "grad_norm": 0.14711131155490875, "kl": 0.46874346137046813, "learning_rate": 5.611601469718601e-06, "loss": 0.1238, "reward": 1.8359375476837159, "reward_std": 0.24985048174858093, "rewards/accuracy_reward": 0.14375000298023224, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7380208551883698, "step": 2125 }, { "clip_ratio": 0.0, "completion_length": 552.0187744140625, "epoch": 0.680428868618979, "grad_norm": 0.24069844186306, "kl": 0.36626454442739487, "learning_rate": 5.601561785855833e-06, "loss": 0.1027, "reward": 1.8125000596046448, "reward_std": 0.24355322867631912, "rewards/accuracy_reward": 0.11250000316649675, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7375000238418579, "step": 2126 }, { "clip_ratio": 0.0, "completion_length": 530.9208526611328, "epoch": 0.6807489198271723, "grad_norm": 0.26626524329185486, "kl": 0.39541344419121743, "learning_rate": 5.591527595839365e-06, "loss": 0.1121, "reward": 1.7197916746139525, "reward_std": 0.18664910271763802, "rewards/accuracy_reward": 0.018750000558793545, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7406250178813935, "step": 2127 }, { "clip_ratio": 0.0, "completion_length": 545.7750183105469, "epoch": 0.6810689710353657, "grad_norm": 0.21741662919521332, "kl": 0.30557314343750475, "learning_rate": 5.5814989122023385e-06, "loss": 0.1023, "reward": 1.8036458849906922, "reward_std": 0.1822378784418106, "rewards/accuracy_reward": 0.10416667070239782, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7432291746139527, "step": 2128 }, { "clip_ratio": 0.0, "completion_length": 564.6750213623047, "epoch": 0.681389022243559, "grad_norm": 0.1766936480998993, "kl": 0.3221236035227776, "learning_rate": 5.571475747471036e-06, "loss": 0.0895, "reward": 1.7989583611488342, "reward_std": 0.1792902246117592, "rewards/accuracy_reward": 0.10000000204890966, "rewards/format_reward": 0.9604166746139526, "rewards/tag_count_reward": 0.7385416746139526, "step": 2129 }, { "clip_ratio": 0.0, "completion_length": 569.2250183105468, "epoch": 0.6817090734517522, "grad_norm": 0.20995013415813446, "kl": 0.5006738707423211, "learning_rate": 5.561458114164837e-06, "loss": 0.156, "reward": 1.7026041865348815, "reward_std": 0.28156317621469495, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.9375000178813935, "rewards/tag_count_reward": 0.7276041865348816, "step": 2130 }, { "clip_ratio": 0.0, "completion_length": 535.9729248046875, "epoch": 0.6820291246599456, "grad_norm": 0.1605108231306076, "kl": 0.27917307913303374, "learning_rate": 5.551446024796214e-06, "loss": 0.0716, "reward": 1.850520873069763, "reward_std": 0.19164575263857841, "rewards/accuracy_reward": 0.13750000596046447, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7442708432674408, "step": 2131 }, { "clip_ratio": 0.0, "completion_length": 553.6562683105469, "epoch": 0.6823491758681389, "grad_norm": 0.3428249955177307, "kl": 0.2938417851924896, "learning_rate": 5.541439491870716e-06, "loss": 0.1212, "reward": 1.7604167103767394, "reward_std": 0.2516637593507767, "rewards/accuracy_reward": 0.07708333693444729, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7375000178813934, "step": 2132 }, { "clip_ratio": 0.0, "completion_length": 584.7375183105469, "epoch": 0.6826692270763323, "grad_norm": 0.12911154329776764, "kl": 0.3031469196081161, "learning_rate": 5.53143852788695e-06, "loss": 0.0998, "reward": 1.7541667222976685, "reward_std": 0.24215374365448952, "rewards/accuracy_reward": 0.07083333507180214, "rewards/format_reward": 0.9395833611488342, "rewards/tag_count_reward": 0.7437500178813934, "step": 2133 }, { "clip_ratio": 0.0, "completion_length": 563.9583557128906, "epoch": 0.6829892782845255, "grad_norm": 0.13713647425174713, "kl": 0.36711022555828093, "learning_rate": 5.521443145336568e-06, "loss": 0.1164, "reward": 1.757812535762787, "reward_std": 0.24515254348516463, "rewards/accuracy_reward": 0.07291666772216558, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7328125298023224, "step": 2134 }, { "clip_ratio": 0.0, "completion_length": 574.520849609375, "epoch": 0.6833093294927188, "grad_norm": 0.2623184025287628, "kl": 0.4985116317868233, "learning_rate": 5.511453356704251e-06, "loss": 0.1464, "reward": 1.6989583492279052, "reward_std": 0.22973346561193467, "rewards/accuracy_reward": 0.01666666716337204, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7364583492279053, "step": 2135 }, { "clip_ratio": 0.0, "completion_length": 542.529183959961, "epoch": 0.6836293807009122, "grad_norm": 0.2138541042804718, "kl": 0.3363912686705589, "learning_rate": 5.501469174467695e-06, "loss": 0.0797, "reward": 1.8890625476837157, "reward_std": 0.17162477001547813, "rewards/accuracy_reward": 0.17291666977107525, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7432291805744171, "step": 2136 }, { "clip_ratio": 0.0, "completion_length": 561.3000091552734, "epoch": 0.6839494319091055, "grad_norm": 0.20941811800003052, "kl": 0.5751274846494198, "learning_rate": 5.491490611097586e-06, "loss": 0.1335, "reward": 1.8187500476837157, "reward_std": 0.27915480434894563, "rewards/accuracy_reward": 0.15000000577419997, "rewards/format_reward": 0.9354166865348816, "rewards/tag_count_reward": 0.7333333492279053, "step": 2137 }, { "clip_ratio": 0.0, "completion_length": 547.314599609375, "epoch": 0.6842694831172987, "grad_norm": 0.1732366532087326, "kl": 0.4344419322907925, "learning_rate": 5.481517679057595e-06, "loss": 0.1137, "reward": 1.8453125596046447, "reward_std": 0.21131417751312256, "rewards/accuracy_reward": 0.15416667181998492, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7390625238418579, "step": 2138 }, { "clip_ratio": 0.0, "completion_length": 572.8166931152343, "epoch": 0.6845895343254921, "grad_norm": 0.3594232499599457, "kl": 0.330796167999506, "learning_rate": 5.4715503908043654e-06, "loss": 0.0919, "reward": 1.7781250119209289, "reward_std": 0.27583343237638475, "rewards/accuracy_reward": 0.09375000055879354, "rewards/format_reward": 0.9479166805744171, "rewards/tag_count_reward": 0.7364583492279053, "step": 2139 }, { "clip_ratio": 0.0, "completion_length": 545.20419921875, "epoch": 0.6849095855336854, "grad_norm": 0.24212028086185455, "kl": 0.5013069108128547, "learning_rate": 5.461588758787484e-06, "loss": 0.1604, "reward": 1.7833333730697631, "reward_std": 0.26907457038760185, "rewards/accuracy_reward": 0.1062500026077032, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.7354166805744171, "step": 2140 }, { "clip_ratio": 0.0, "completion_length": 543.8687744140625, "epoch": 0.6852296367418788, "grad_norm": 0.21253329515457153, "kl": 0.4218080222606659, "learning_rate": 5.4516327954494764e-06, "loss": 0.1252, "reward": 1.7338542103767396, "reward_std": 0.2643717348575592, "rewards/accuracy_reward": 0.05000000260770321, "rewards/format_reward": 0.9437500298023224, "rewards/tag_count_reward": 0.7401041924953461, "step": 2141 }, { "clip_ratio": 0.0, "completion_length": 539.6500183105469, "epoch": 0.685549687950072, "grad_norm": 0.22795309126377106, "kl": 0.4938424080610275, "learning_rate": 5.441682513225786e-06, "loss": 0.1201, "reward": 1.7203125357627869, "reward_std": 0.2679354429244995, "rewards/accuracy_reward": 0.050000001676380634, "rewards/format_reward": 0.9333333551883698, "rewards/tag_count_reward": 0.736979192495346, "step": 2142 }, { "clip_ratio": 0.0, "completion_length": 530.3500122070312, "epoch": 0.6858697391582653, "grad_norm": 0.15360769629478455, "kl": 0.2627089634537697, "learning_rate": 5.431737924544763e-06, "loss": 0.0813, "reward": 1.8484375596046447, "reward_std": 0.18890125900506974, "rewards/accuracy_reward": 0.14375000353902578, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7421875178813935, "step": 2143 }, { "clip_ratio": 0.0, "completion_length": 575.5666809082031, "epoch": 0.6861897903664587, "grad_norm": 0.31977367401123047, "kl": 0.3762332767248154, "learning_rate": 5.421799041827646e-06, "loss": 0.0954, "reward": 1.8666667103767396, "reward_std": 0.22807515114545823, "rewards/accuracy_reward": 0.18541667200624942, "rewards/format_reward": 0.9437500238418579, "rewards/tag_count_reward": 0.7375000238418579, "step": 2144 }, { "clip_ratio": 0.0, "completion_length": 583.3625183105469, "epoch": 0.6865098415746519, "grad_norm": 0.17487220466136932, "kl": 0.3942092776298523, "learning_rate": 5.411865877488536e-06, "loss": 0.1055, "reward": 1.7192708849906921, "reward_std": 0.22079168483614922, "rewards/accuracy_reward": 0.03333333395421505, "rewards/format_reward": 0.943750011920929, "rewards/tag_count_reward": 0.7421875119209289, "step": 2145 }, { "clip_ratio": 0.0, "completion_length": 571.1604461669922, "epoch": 0.6868298927828452, "grad_norm": 0.15001332759857178, "kl": 0.41321970373392103, "learning_rate": 5.401938443934405e-06, "loss": 0.1174, "reward": 1.7677083730697631, "reward_std": 0.2351425528526306, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7385416924953461, "step": 2146 }, { "clip_ratio": 0.0, "completion_length": 565.3229370117188, "epoch": 0.6871499439910386, "grad_norm": 0.30220097303390503, "kl": 0.42144326865673065, "learning_rate": 5.392016753565059e-06, "loss": 0.181, "reward": 1.6854167103767395, "reward_std": 0.3153300307691097, "rewards/accuracy_reward": 0.027083334140479566, "rewards/format_reward": 0.9250000178813934, "rewards/tag_count_reward": 0.7333333432674408, "step": 2147 }, { "clip_ratio": 0.0, "completion_length": 534.4375213623047, "epoch": 0.6874699951992319, "grad_norm": 0.13343147933483124, "kl": 0.2602052837610245, "learning_rate": 5.382100818773144e-06, "loss": 0.0989, "reward": 1.7572917103767396, "reward_std": 0.1719522811472416, "rewards/accuracy_reward": 0.05000000149011612, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7427083432674408, "step": 2148 }, { "clip_ratio": 0.0, "completion_length": 552.0833618164063, "epoch": 0.6877900464074251, "grad_norm": 0.37386593222618103, "kl": 0.31635802537202834, "learning_rate": 5.3721906519440945e-06, "loss": 0.1481, "reward": 1.781250035762787, "reward_std": 0.2873902410268784, "rewards/accuracy_reward": 0.08958333656191826, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7375000178813934, "step": 2149 }, { "clip_ratio": 0.0, "completion_length": 550.1146026611328, "epoch": 0.6881100976156185, "grad_norm": 0.1896275281906128, "kl": 0.336704520881176, "learning_rate": 5.362286265456158e-06, "loss": 0.1424, "reward": 1.7984375476837158, "reward_std": 0.2562291607260704, "rewards/accuracy_reward": 0.11041666977107525, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7401041805744171, "step": 2150 }, { "clip_ratio": 0.0, "completion_length": 557.7875244140625, "epoch": 0.6884301488238118, "grad_norm": 0.19244590401649475, "kl": 0.5367755405604839, "learning_rate": 5.352387671680357e-06, "loss": 0.1137, "reward": 1.7083333611488343, "reward_std": 0.2284008175134659, "rewards/accuracy_reward": 0.020833334326744078, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7354166865348816, "step": 2151 }, { "clip_ratio": 0.0, "completion_length": 552.3708526611329, "epoch": 0.6887502000320052, "grad_norm": 0.290466845035553, "kl": 0.4612713187932968, "learning_rate": 5.34249488298048e-06, "loss": 0.1148, "reward": 1.8708333730697633, "reward_std": 0.228383469581604, "rewards/accuracy_reward": 0.18333333935588597, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7333333551883697, "step": 2152 }, { "clip_ratio": 0.0, "completion_length": 594.0833435058594, "epoch": 0.6890702512401984, "grad_norm": 0.2586331367492676, "kl": 0.47631366848945617, "learning_rate": 5.332607911713057e-06, "loss": 0.1424, "reward": 1.7338542222976685, "reward_std": 0.33738535493612287, "rewards/accuracy_reward": 0.0770833358168602, "rewards/format_reward": 0.9333333551883698, "rewards/tag_count_reward": 0.7234375178813934, "step": 2153 }, { "clip_ratio": 0.0, "completion_length": 547.283349609375, "epoch": 0.6893903024483917, "grad_norm": 0.3231147229671478, "kl": 0.4204250156879425, "learning_rate": 5.3227267702273625e-06, "loss": 0.1248, "reward": 1.7630208611488343, "reward_std": 0.25813994109630584, "rewards/accuracy_reward": 0.07500000167638063, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7359375178813934, "step": 2154 }, { "clip_ratio": 0.0, "completion_length": 576.6062744140625, "epoch": 0.6897103536565851, "grad_norm": 0.30079278349876404, "kl": 0.36308655291795733, "learning_rate": 5.312851470865383e-06, "loss": 0.1368, "reward": 1.733333373069763, "reward_std": 0.26057887747883796, "rewards/accuracy_reward": 0.054166669771075246, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7333333492279053, "step": 2155 }, { "clip_ratio": 0.0, "completion_length": 566.1395965576172, "epoch": 0.6900304048647784, "grad_norm": 0.2262878268957138, "kl": 0.3142340861260891, "learning_rate": 5.30298202596181e-06, "loss": 0.1499, "reward": 1.778125035762787, "reward_std": 0.2502134948968887, "rewards/accuracy_reward": 0.09583333544433117, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7343750238418579, "step": 2156 }, { "clip_ratio": 0.0, "completion_length": 547.9521118164063, "epoch": 0.6903504560729716, "grad_norm": 0.21603688597679138, "kl": 0.2521523617208004, "learning_rate": 5.293118447844023e-06, "loss": 0.0752, "reward": 1.7458333611488341, "reward_std": 0.16236398369073868, "rewards/accuracy_reward": 0.03125000111758709, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7416666865348815, "step": 2157 }, { "clip_ratio": 0.0, "completion_length": 544.2062683105469, "epoch": 0.690670507281165, "grad_norm": 0.17151077091693878, "kl": 0.4505059730261564, "learning_rate": 5.283260748832072e-06, "loss": 0.1216, "reward": 1.825520896911621, "reward_std": 0.2066534325480461, "rewards/accuracy_reward": 0.13333333730697633, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.735937523841858, "step": 2158 }, { "clip_ratio": 0.0, "completion_length": 548.445849609375, "epoch": 0.6909905584893583, "grad_norm": 0.20686721801757812, "kl": 0.38486229777336123, "learning_rate": 5.2734089412386646e-06, "loss": 0.0973, "reward": 1.7302083611488341, "reward_std": 0.193883565813303, "rewards/accuracy_reward": 0.03333333469927311, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7385416865348816, "step": 2159 }, { "clip_ratio": 0.0, "completion_length": 551.3666870117188, "epoch": 0.6913106096975516, "grad_norm": 0.16719911992549896, "kl": 0.39740680269896983, "learning_rate": 5.26356303736915e-06, "loss": 0.0701, "reward": 1.7463542103767395, "reward_std": 0.20027967020869256, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7338541805744171, "step": 2160 }, { "clip_ratio": 0.0, "completion_length": 534.6208526611329, "epoch": 0.6916306609057449, "grad_norm": 0.1335284560918808, "kl": 0.22143305391073226, "learning_rate": 5.253723049521507e-06, "loss": 0.0538, "reward": 1.8437500596046448, "reward_std": 0.19260309934616088, "rewards/accuracy_reward": 0.13333333805203437, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7416666746139526, "step": 2161 }, { "clip_ratio": 0.0, "completion_length": 534.5687683105468, "epoch": 0.6919507121139382, "grad_norm": 0.11407246440649033, "kl": 0.3099120303988457, "learning_rate": 5.243888989986312e-06, "loss": 0.1122, "reward": 1.8453125357627869, "reward_std": 0.2336819589138031, "rewards/accuracy_reward": 0.1354166692122817, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7390625178813934, "step": 2162 }, { "clip_ratio": 0.0, "completion_length": 563.5437683105469, "epoch": 0.6922707633221316, "grad_norm": 0.26538652181625366, "kl": 0.1981400392949581, "learning_rate": 5.234060871046751e-06, "loss": 0.0868, "reward": 1.7760417222976685, "reward_std": 0.2001117028295994, "rewards/accuracy_reward": 0.06666666828095913, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.7406250238418579, "step": 2163 }, { "clip_ratio": 0.0, "completion_length": 537.464599609375, "epoch": 0.6925908145303249, "grad_norm": 0.12414728850126266, "kl": 0.18391840681433677, "learning_rate": 5.224238704978584e-06, "loss": 0.0712, "reward": 1.7281250357627869, "reward_std": 0.17280828654766084, "rewards/accuracy_reward": 0.016666666977107523, "rewards/format_reward": 0.9708333432674408, "rewards/tag_count_reward": 0.740625011920929, "step": 2164 }, { "clip_ratio": 0.0, "completion_length": 540.6791839599609, "epoch": 0.6929108657385181, "grad_norm": 0.15423992276191711, "kl": 0.2515277363359928, "learning_rate": 5.2144225040501375e-06, "loss": 0.0848, "reward": 1.7729167103767396, "reward_std": 0.1920185036957264, "rewards/accuracy_reward": 0.07500000298023224, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7395833492279053, "step": 2165 }, { "clip_ratio": 0.0, "completion_length": 544.2208557128906, "epoch": 0.6932309169467115, "grad_norm": 0.20855121314525604, "kl": 0.4149386554956436, "learning_rate": 5.2046122805222845e-06, "loss": 0.1062, "reward": 1.7333333492279053, "reward_std": 0.234011735022068, "rewards/accuracy_reward": 0.04583333358168602, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7375000298023224, "step": 2166 }, { "clip_ratio": 0.0, "completion_length": 545.0125152587891, "epoch": 0.6935509681549048, "grad_norm": 0.17261765897274017, "kl": 0.31774858236312864, "learning_rate": 5.194808046648434e-06, "loss": 0.0679, "reward": 1.743750011920929, "reward_std": 0.19351360499858855, "rewards/accuracy_reward": 0.0479166679084301, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7354166865348816, "step": 2167 }, { "clip_ratio": 0.0, "completion_length": 538.5458435058594, "epoch": 0.6938710193630981, "grad_norm": 0.1321432739496231, "kl": 0.27640740275382997, "learning_rate": 5.185009814674513e-06, "loss": 0.1257, "reward": 1.7958333849906922, "reward_std": 0.2482768900692463, "rewards/accuracy_reward": 0.10208333749324083, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7354166805744171, "step": 2168 }, { "clip_ratio": 0.0, "completion_length": 551.337515258789, "epoch": 0.6941910705712914, "grad_norm": 0.13096804916858673, "kl": 0.26496610417962074, "learning_rate": 5.175217596838956e-06, "loss": 0.0941, "reward": 1.7447916984558105, "reward_std": 0.18921037912368774, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7364583551883698, "step": 2169 }, { "clip_ratio": 0.0, "completion_length": 562.9062805175781, "epoch": 0.6945111217794847, "grad_norm": 0.11738302558660507, "kl": 0.23022876232862471, "learning_rate": 5.165431405372674e-06, "loss": 0.0782, "reward": 1.7848958492279052, "reward_std": 0.21770973801612853, "rewards/accuracy_reward": 0.07708333358168602, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7411458551883697, "step": 2170 }, { "clip_ratio": 0.0, "completion_length": 553.9062622070312, "epoch": 0.694831172987678, "grad_norm": 0.16358114778995514, "kl": 0.3719410330057144, "learning_rate": 5.1556512524990636e-06, "loss": 0.0994, "reward": 1.7494791984558105, "reward_std": 0.2786561943590641, "rewards/accuracy_reward": 0.06250000130385161, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7348958551883698, "step": 2171 }, { "clip_ratio": 0.0, "completion_length": 561.7125213623046, "epoch": 0.6951512241958714, "grad_norm": 0.21294313669204712, "kl": 0.2740899085998535, "learning_rate": 5.145877150433967e-06, "loss": 0.1084, "reward": 1.7979166984558106, "reward_std": 0.23426204025745392, "rewards/accuracy_reward": 0.10416667088866234, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7354166805744171, "step": 2172 }, { "clip_ratio": 0.0, "completion_length": 552.1500183105469, "epoch": 0.6954712754040646, "grad_norm": 0.1394650638103485, "kl": 0.3359750546514988, "learning_rate": 5.1361091113856875e-06, "loss": 0.1008, "reward": 1.7921875357627868, "reward_std": 0.15876233726739883, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7421875178813935, "step": 2173 }, { "clip_ratio": 0.0, "completion_length": 539.2875061035156, "epoch": 0.695791326612258, "grad_norm": 0.10760974884033203, "kl": 0.25749983713030816, "learning_rate": 5.126347147554936e-06, "loss": 0.0922, "reward": 1.7937500476837158, "reward_std": 0.18401784151792527, "rewards/accuracy_reward": 0.08958333600312471, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7375000178813934, "step": 2174 }, { "clip_ratio": 0.0, "completion_length": 532.1146026611328, "epoch": 0.6961113778204513, "grad_norm": 0.21234045922756195, "kl": 0.38290523290634154, "learning_rate": 5.116591271134839e-06, "loss": 0.0837, "reward": 1.7427083849906921, "reward_std": 0.17233860194683076, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7385416865348816, "step": 2175 }, { "clip_ratio": 0.0, "completion_length": 542.0083557128906, "epoch": 0.6964314290286446, "grad_norm": 0.29094812273979187, "kl": 0.4088446289300919, "learning_rate": 5.106841494310929e-06, "loss": 0.0907, "reward": 1.7270833730697632, "reward_std": 0.20447328686714172, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.9562500298023224, "rewards/tag_count_reward": 0.7354166805744171, "step": 2176 }, { "clip_ratio": 0.0, "completion_length": 547.1562744140625, "epoch": 0.6967514802368379, "grad_norm": 0.24387161433696747, "kl": 0.5301667034626008, "learning_rate": 5.097097829261115e-06, "loss": 0.0933, "reward": 1.7411458969116211, "reward_std": 0.19163210839033126, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7369791865348816, "step": 2177 }, { "clip_ratio": 0.0, "completion_length": 524.839599609375, "epoch": 0.6970715314450312, "grad_norm": 0.31514298915863037, "kl": 0.340422347933054, "learning_rate": 5.087360288155664e-06, "loss": 0.098, "reward": 1.7343750596046448, "reward_std": 0.24370125085115432, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7364583492279053, "step": 2178 }, { "clip_ratio": 0.0, "completion_length": 553.7562713623047, "epoch": 0.6973915826532245, "grad_norm": 0.09657612442970276, "kl": 0.2676196489483118, "learning_rate": 5.077628883157205e-06, "loss": 0.0499, "reward": 1.756250023841858, "reward_std": 0.12914905995130538, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7437500119209289, "step": 2179 }, { "clip_ratio": 0.0, "completion_length": 555.1396087646484, "epoch": 0.6977116338614179, "grad_norm": 0.24548500776290894, "kl": 0.45806365422904494, "learning_rate": 5.067903626420698e-06, "loss": 0.1327, "reward": 1.795312523841858, "reward_std": 0.2129717141389847, "rewards/accuracy_reward": 0.10208333730697632, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7328125178813935, "step": 2180 }, { "clip_ratio": 0.0, "completion_length": 554.3729370117187, "epoch": 0.6980316850696111, "grad_norm": 0.15102331340312958, "kl": 0.37719000279903414, "learning_rate": 5.058184530093424e-06, "loss": 0.0906, "reward": 1.7322916865348816, "reward_std": 0.19234516769647597, "rewards/accuracy_reward": 0.029166667722165585, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7385416865348816, "step": 2181 }, { "clip_ratio": 0.0, "completion_length": 578.0646057128906, "epoch": 0.6983517362778044, "grad_norm": 0.1571054458618164, "kl": 0.33296659886837005, "learning_rate": 5.048471606314971e-06, "loss": 0.0758, "reward": 1.7744791865348817, "reward_std": 0.17921509444713593, "rewards/accuracy_reward": 0.07083333786576987, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.739062511920929, "step": 2182 }, { "clip_ratio": 0.0, "completion_length": 567.8687622070313, "epoch": 0.6986717874859978, "grad_norm": 0.33309119939804077, "kl": 0.4173093684017658, "learning_rate": 5.038764867217214e-06, "loss": 0.0955, "reward": 1.7833333730697631, "reward_std": 0.20730995163321495, "rewards/accuracy_reward": 0.08333333749324083, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7375000238418579, "step": 2183 }, { "clip_ratio": 0.0, "completion_length": 556.502099609375, "epoch": 0.6989918386941911, "grad_norm": 0.15169481933116913, "kl": 0.22802985832095146, "learning_rate": 5.0290643249243065e-06, "loss": 0.0639, "reward": 1.760937511920929, "reward_std": 0.13749925792217255, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.9791666805744171, "rewards/tag_count_reward": 0.7401041865348816, "step": 2184 }, { "clip_ratio": 0.0, "completion_length": 545.8312683105469, "epoch": 0.6993118899023844, "grad_norm": 0.17143678665161133, "kl": 0.23197558745741845, "learning_rate": 5.019369991552658e-06, "loss": 0.0367, "reward": 1.8010416984558106, "reward_std": 0.08791158646345139, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.9854166746139527, "rewards/tag_count_reward": 0.7447916805744171, "step": 2185 }, { "clip_ratio": 0.0, "completion_length": 526.4062683105469, "epoch": 0.6996319411105777, "grad_norm": 0.05793720856308937, "kl": 0.16285086199641227, "learning_rate": 5.00968187921093e-06, "loss": 0.045, "reward": 1.848437523841858, "reward_std": 0.1568957671523094, "rewards/accuracy_reward": 0.12083333637565374, "rewards/format_reward": 0.9833333432674408, "rewards/tag_count_reward": 0.7442708432674408, "step": 2186 }, { "clip_ratio": 0.0, "completion_length": 578.0666931152343, "epoch": 0.699951992318771, "grad_norm": 0.42837584018707275, "kl": 0.21833952143788338, "learning_rate": 5.000000000000003e-06, "loss": 0.0511, "reward": 1.7682292103767394, "reward_std": 0.12465300261974335, "rewards/accuracy_reward": 0.05000000149011612, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.7411458492279053, "step": 2187 }, { "clip_ratio": 0.0, "completion_length": 570.4041870117187, "epoch": 0.7002720435269643, "grad_norm": 0.08884342014789581, "kl": 0.3156105622649193, "learning_rate": 4.990324366012977e-06, "loss": 0.0828, "reward": 1.7864583492279054, "reward_std": 0.1695403054356575, "rewards/accuracy_reward": 0.07916666995733976, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.740625011920929, "step": 2188 }, { "clip_ratio": 0.0, "completion_length": 569.4437622070312, "epoch": 0.7005920947351576, "grad_norm": 0.23606713116168976, "kl": 0.1369132250547409, "learning_rate": 4.980654989335156e-06, "loss": 0.0579, "reward": 1.8208333611488343, "reward_std": 0.10346375182271003, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7458333551883698, "step": 2189 }, { "clip_ratio": 0.0, "completion_length": 584.3271057128907, "epoch": 0.7009121459433509, "grad_norm": 0.13106216490268707, "kl": 0.3166439961642027, "learning_rate": 4.970991882044024e-06, "loss": 0.0709, "reward": 1.7213541746139527, "reward_std": 0.14932389408349991, "rewards/accuracy_reward": 0.018750001117587088, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7401041865348816, "step": 2190 }, { "clip_ratio": 0.0, "completion_length": 566.6271026611328, "epoch": 0.7012321971515443, "grad_norm": 0.19997036457061768, "kl": 0.41348587200045583, "learning_rate": 4.961335056209234e-06, "loss": 0.0585, "reward": 1.7145833611488341, "reward_std": 0.1433960720896721, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7395833551883697, "step": 2191 }, { "clip_ratio": 0.0, "completion_length": 566.4062683105469, "epoch": 0.7015522483597375, "grad_norm": 0.20145119726657867, "kl": 0.4204010270535946, "learning_rate": 4.9516845238925926e-06, "loss": 0.1152, "reward": 1.7380208849906922, "reward_std": 0.23682481199502944, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7338541805744171, "step": 2192 }, { "clip_ratio": 0.0, "completion_length": 545.3208557128906, "epoch": 0.7018722995679308, "grad_norm": 0.1150142028927803, "kl": 0.34649395793676374, "learning_rate": 4.942040297148049e-06, "loss": 0.0949, "reward": 1.8109375596046449, "reward_std": 0.2290053188800812, "rewards/accuracy_reward": 0.11875000298023224, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7359375178813934, "step": 2193 }, { "clip_ratio": 0.0, "completion_length": 566.8625122070313, "epoch": 0.7021923507761242, "grad_norm": 0.20946632325649261, "kl": 0.3540042258799076, "learning_rate": 4.932402388021677e-06, "loss": 0.1114, "reward": 1.7692708611488341, "reward_std": 0.24689008817076682, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7401041924953461, "step": 2194 }, { "clip_ratio": 0.0, "completion_length": 555.9604431152344, "epoch": 0.7025124019843175, "grad_norm": 0.21829591691493988, "kl": 0.3825660213828087, "learning_rate": 4.922770808551649e-06, "loss": 0.0676, "reward": 1.7973958849906921, "reward_std": 0.21412076726555823, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.9583333671092987, "rewards/tag_count_reward": 0.7369791865348816, "step": 2195 }, { "clip_ratio": 0.0, "completion_length": 542.0666809082031, "epoch": 0.7028324531925108, "grad_norm": 0.21612143516540527, "kl": 0.23135574162006378, "learning_rate": 4.913145570768243e-06, "loss": 0.1101, "reward": 1.7859375476837158, "reward_std": 0.17264212369918824, "rewards/accuracy_reward": 0.07916666902601718, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7380208611488343, "step": 2196 }, { "clip_ratio": 0.0, "completion_length": 569.3833618164062, "epoch": 0.7031525044007041, "grad_norm": 0.10298977792263031, "kl": 0.20634137317538262, "learning_rate": 4.9035266866938125e-06, "loss": 0.0891, "reward": 1.8015625476837158, "reward_std": 0.23462976813316344, "rewards/accuracy_reward": 0.10625000149011612, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7369791865348816, "step": 2197 }, { "clip_ratio": 0.0, "completion_length": 572.5396057128906, "epoch": 0.7034725556088974, "grad_norm": 0.17319689691066742, "kl": 0.4252075083553791, "learning_rate": 4.8939141683427735e-06, "loss": 0.1317, "reward": 1.7807291865348815, "reward_std": 0.28460691273212435, "rewards/accuracy_reward": 0.09583333767950535, "rewards/format_reward": 0.9500000357627869, "rewards/tag_count_reward": 0.7348958492279053, "step": 2198 }, { "clip_ratio": 0.0, "completion_length": 558.5041809082031, "epoch": 0.7037926068170908, "grad_norm": 0.37925511598587036, "kl": 0.25806584507226943, "learning_rate": 4.884308027721593e-06, "loss": 0.1236, "reward": 1.7067708730697633, "reward_std": 0.16876881793141366, "rewards/accuracy_reward": 0.00625, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7421875238418579, "step": 2199 }, { "clip_ratio": 0.0, "completion_length": 583.7271057128906, "epoch": 0.704112658025284, "grad_norm": 0.1057218387722969, "kl": 0.24062796980142592, "learning_rate": 4.87470827682877e-06, "loss": 0.0878, "reward": 1.7885417222976685, "reward_std": 0.19179236218333245, "rewards/accuracy_reward": 0.08125000260770321, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7406250178813935, "step": 2200 }, { "clip_ratio": 0.0, "completion_length": 566.097933959961, "epoch": 0.7044327092334773, "grad_norm": 0.1587941199541092, "kl": 0.3910080552101135, "learning_rate": 4.865114927654824e-06, "loss": 0.1293, "reward": 1.8296875357627869, "reward_std": 0.24853597730398178, "rewards/accuracy_reward": 0.1500000050291419, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7338541865348815, "step": 2201 }, { "clip_ratio": 0.0, "completion_length": 577.4812683105469, "epoch": 0.7047527604416707, "grad_norm": 0.16094599664211273, "kl": 0.3398930035531521, "learning_rate": 4.855527992182281e-06, "loss": 0.09, "reward": 1.7609375238418579, "reward_std": 0.21207387149333953, "rewards/accuracy_reward": 0.06250000335276126, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.735937523841858, "step": 2202 }, { "clip_ratio": 0.0, "completion_length": 577.9229309082032, "epoch": 0.705072811649864, "grad_norm": 0.32247263193130493, "kl": 0.5623447112739086, "learning_rate": 4.8459474823856445e-06, "loss": 0.11, "reward": 1.6973958730697631, "reward_std": 0.22016318514943123, "rewards/accuracy_reward": 0.018750000186264514, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7307291924953461, "step": 2203 }, { "clip_ratio": 0.0, "completion_length": 596.1750244140625, "epoch": 0.7053928628580572, "grad_norm": 0.2458748072385788, "kl": 0.33539998829364776, "learning_rate": 4.836373410231405e-06, "loss": 0.1112, "reward": 1.7250000357627868, "reward_std": 0.2504465445876122, "rewards/accuracy_reward": 0.0479166692122817, "rewards/format_reward": 0.9395833671092987, "rewards/tag_count_reward": 0.7375000178813934, "step": 2204 }, { "clip_ratio": 0.0, "completion_length": 581.4750244140625, "epoch": 0.7057129140662506, "grad_norm": 0.16893352568149567, "kl": 0.3260126397013664, "learning_rate": 4.8268057876780075e-06, "loss": 0.0876, "reward": 1.7765625357627868, "reward_std": 0.2559411033987999, "rewards/accuracy_reward": 0.07500000186264515, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7390625178813934, "step": 2205 }, { "clip_ratio": 0.0, "completion_length": 598.6479248046875, "epoch": 0.7060329652744439, "grad_norm": 0.21566948294639587, "kl": 0.4895564019680023, "learning_rate": 4.81724462667584e-06, "loss": 0.1139, "reward": 1.6729167103767395, "reward_std": 0.22808781638741493, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.9437500238418579, "rewards/tag_count_reward": 0.7270833492279053, "step": 2206 }, { "clip_ratio": 0.0, "completion_length": 561.2021057128907, "epoch": 0.7063530164826373, "grad_norm": 0.13912886381149292, "kl": 0.29945429414510727, "learning_rate": 4.807689939167222e-06, "loss": 0.0767, "reward": 1.7656250476837159, "reward_std": 0.17697276175022125, "rewards/accuracy_reward": 0.06250000204890967, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7406250178813935, "step": 2207 }, { "clip_ratio": 0.0, "completion_length": 571.4666900634766, "epoch": 0.7066730676908305, "grad_norm": 0.19941870868206024, "kl": 0.3455564148724079, "learning_rate": 4.798141737086384e-06, "loss": 0.1115, "reward": 1.90208340883255, "reward_std": 0.1985380657017231, "rewards/accuracy_reward": 0.202083339355886, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.737500011920929, "step": 2208 }, { "clip_ratio": 0.0, "completion_length": 578.191683959961, "epoch": 0.7069931188990238, "grad_norm": 0.32233765721321106, "kl": 0.278411491215229, "learning_rate": 4.788600032359461e-06, "loss": 0.0535, "reward": 1.825000023841858, "reward_std": 0.16070781499147416, "rewards/accuracy_reward": 0.11250000204890967, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7395833492279053, "step": 2209 }, { "clip_ratio": 0.0, "completion_length": 571.3521026611328, "epoch": 0.7073131701072172, "grad_norm": 0.1364719420671463, "kl": 0.49797850996255877, "learning_rate": 4.77906483690447e-06, "loss": 0.1008, "reward": 1.7989583611488342, "reward_std": 0.2100510597229004, "rewards/accuracy_reward": 0.10000000204890966, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7343750298023224, "step": 2210 }, { "clip_ratio": 0.0, "completion_length": 567.9500274658203, "epoch": 0.7076332213154105, "grad_norm": 0.20895767211914062, "kl": 0.31710937693715097, "learning_rate": 4.769536162631292e-06, "loss": 0.0932, "reward": 1.8171875476837158, "reward_std": 0.19292281717061996, "rewards/accuracy_reward": 0.11250000409781932, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7421875238418579, "step": 2211 }, { "clip_ratio": 0.0, "completion_length": 585.5812683105469, "epoch": 0.7079532725236037, "grad_norm": 0.12749753892421722, "kl": 0.40187211334705353, "learning_rate": 4.760014021441671e-06, "loss": 0.1104, "reward": 1.7026042103767396, "reward_std": 0.2228372722864151, "rewards/accuracy_reward": 0.016666667349636555, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7380208671092987, "step": 2212 }, { "clip_ratio": 0.0, "completion_length": 571.4541839599609, "epoch": 0.7082733237317971, "grad_norm": 0.18957151472568512, "kl": 0.18363816738128663, "learning_rate": 4.750498425229188e-06, "loss": 0.0541, "reward": 1.8494792342185975, "reward_std": 0.22633131146430968, "rewards/accuracy_reward": 0.13750000670552254, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7432291805744171, "step": 2213 }, { "clip_ratio": 0.0, "completion_length": 583.0437683105469, "epoch": 0.7085933749399904, "grad_norm": 0.14039163291454315, "kl": 0.4518029972910881, "learning_rate": 4.740989385879248e-06, "loss": 0.0622, "reward": 1.8098958492279054, "reward_std": 0.23249467983841896, "rewards/accuracy_reward": 0.11666667088866234, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7348958551883698, "step": 2214 }, { "clip_ratio": 0.0, "completion_length": 591.1229309082031, "epoch": 0.7089134261481838, "grad_norm": 0.1878720074892044, "kl": 0.1566619262099266, "learning_rate": 4.731486915269066e-06, "loss": 0.0492, "reward": 1.8062500357627869, "reward_std": 0.13992664515972136, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.7458333492279052, "step": 2215 }, { "clip_ratio": 0.0, "completion_length": 576.652099609375, "epoch": 0.709233477356377, "grad_norm": 0.11067891865968704, "kl": 0.2782533496618271, "learning_rate": 4.721991025267657e-06, "loss": 0.0475, "reward": 1.8005208849906922, "reward_std": 0.21060936525464058, "rewards/accuracy_reward": 0.09375000391155482, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7401041865348816, "step": 2216 }, { "clip_ratio": 0.0, "completion_length": 577.7771087646485, "epoch": 0.7095535285645703, "grad_norm": 0.2800833582878113, "kl": 0.2611836478114128, "learning_rate": 4.712501727735808e-06, "loss": 0.1179, "reward": 1.8302083730697631, "reward_std": 0.2340967148542404, "rewards/accuracy_reward": 0.1395833384245634, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7343750178813935, "step": 2217 }, { "clip_ratio": 0.0, "completion_length": 582.495849609375, "epoch": 0.7098735797727637, "grad_norm": 0.11880511045455933, "kl": 0.31443496271967886, "learning_rate": 4.703019034526082e-06, "loss": 0.0788, "reward": 1.7661458492279052, "reward_std": 0.18282609283924103, "rewards/accuracy_reward": 0.06458333488553762, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7390625178813934, "step": 2218 }, { "clip_ratio": 0.0, "completion_length": 611.8687683105469, "epoch": 0.710193630980957, "grad_norm": 0.14947304129600525, "kl": 0.4382303521037102, "learning_rate": 4.693542957482786e-06, "loss": 0.0875, "reward": 1.7578125238418578, "reward_std": 0.22819079458713531, "rewards/accuracy_reward": 0.0687500013038516, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7348958551883698, "step": 2219 }, { "clip_ratio": 0.0, "completion_length": 583.245849609375, "epoch": 0.7105136821891502, "grad_norm": 0.23497365415096283, "kl": 0.28563379757106305, "learning_rate": 4.684073508441961e-06, "loss": 0.0807, "reward": 1.8682292342185973, "reward_std": 0.1950744114816189, "rewards/accuracy_reward": 0.17708334028720857, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7411458432674408, "step": 2220 }, { "clip_ratio": 0.0, "completion_length": 581.0354309082031, "epoch": 0.7108337333973436, "grad_norm": 0.09888350963592529, "kl": 0.2779009331017733, "learning_rate": 4.674610699231373e-06, "loss": 0.0489, "reward": 1.7447916984558105, "reward_std": 0.18470503836870195, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7364583551883698, "step": 2221 }, { "clip_ratio": 0.0, "completion_length": 553.5125152587891, "epoch": 0.7111537846055369, "grad_norm": 0.23047621548175812, "kl": 0.2806026488542557, "learning_rate": 4.665154541670498e-06, "loss": 0.1104, "reward": 1.7427083611488343, "reward_std": 0.20498983785510064, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7364583492279053, "step": 2222 }, { "clip_ratio": 0.0, "completion_length": 547.8708557128906, "epoch": 0.7114738358137302, "grad_norm": 0.19911795854568481, "kl": 0.13476757146418095, "learning_rate": 4.655705047570498e-06, "loss": 0.0467, "reward": 1.8401041984558106, "reward_std": 0.1357252113521099, "rewards/accuracy_reward": 0.11875000353902579, "rewards/format_reward": 0.9770833432674408, "rewards/tag_count_reward": 0.7442708432674408, "step": 2223 }, { "clip_ratio": 0.0, "completion_length": 598.1687744140625, "epoch": 0.7117938870219235, "grad_norm": 0.16080540418624878, "kl": 0.5152991503477097, "learning_rate": 4.6462622287342154e-06, "loss": 0.122, "reward": 1.7302083849906922, "reward_std": 0.23456745445728303, "rewards/accuracy_reward": 0.04583333544433117, "rewards/format_reward": 0.9416666805744172, "rewards/tag_count_reward": 0.7427083551883698, "step": 2224 }, { "clip_ratio": 0.0, "completion_length": 594.8166870117187, "epoch": 0.7121139382301168, "grad_norm": 0.2366468459367752, "kl": 0.670868530869484, "learning_rate": 4.636826096956153e-06, "loss": 0.1181, "reward": 1.7375000357627868, "reward_std": 0.2714173913002014, "rewards/accuracy_reward": 0.05000000223517418, "rewards/format_reward": 0.9520833611488342, "rewards/tag_count_reward": 0.7354166865348816, "step": 2225 }, { "clip_ratio": 0.0, "completion_length": 575.1687713623047, "epoch": 0.7124339894383102, "grad_norm": 0.19582653045654297, "kl": 0.33867536932229997, "learning_rate": 4.627396664022462e-06, "loss": 0.0905, "reward": 1.806250023841858, "reward_std": 0.23281241804361344, "rewards/accuracy_reward": 0.10416667144745588, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.737500011920929, "step": 2226 }, { "clip_ratio": 0.0, "completion_length": 574.6937713623047, "epoch": 0.7127540406465035, "grad_norm": 0.20600047707557678, "kl": 0.41766203939914703, "learning_rate": 4.617973941710932e-06, "loss": 0.0986, "reward": 1.7557292222976684, "reward_std": 0.2621150605380535, "rewards/accuracy_reward": 0.0729166679084301, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7348958551883698, "step": 2227 }, { "clip_ratio": 0.0, "completion_length": 604.0687744140625, "epoch": 0.7130740918546967, "grad_norm": 0.1924317628145218, "kl": 0.4410012990236282, "learning_rate": 4.608557941790954e-06, "loss": 0.0809, "reward": 1.7578125238418578, "reward_std": 0.22074405550956727, "rewards/accuracy_reward": 0.05416666846722364, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7411458611488342, "step": 2228 }, { "clip_ratio": 0.0, "completion_length": 584.3833557128906, "epoch": 0.7133941430628901, "grad_norm": 0.2038911134004593, "kl": 0.3418886814266443, "learning_rate": 4.5991486760235404e-06, "loss": 0.0756, "reward": 1.9088542103767394, "reward_std": 0.19126099869608879, "rewards/accuracy_reward": 0.20416667088866233, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7380208551883698, "step": 2229 }, { "clip_ratio": 0.0, "completion_length": 550.6812713623046, "epoch": 0.7137141942710834, "grad_norm": 0.33020251989364624, "kl": 0.2924545969814062, "learning_rate": 4.5897461561612814e-06, "loss": 0.0629, "reward": 1.8562500476837158, "reward_std": 0.16590869426727295, "rewards/accuracy_reward": 0.135416672937572, "rewards/format_reward": 0.9770833432674408, "rewards/tag_count_reward": 0.7437500178813934, "step": 2230 }, { "clip_ratio": 0.0, "completion_length": 558.402099609375, "epoch": 0.7140342454792766, "grad_norm": 0.20029504597187042, "kl": 0.37912337966263293, "learning_rate": 4.580350393948355e-06, "loss": 0.0894, "reward": 1.82239590883255, "reward_std": 0.15126769095659257, "rewards/accuracy_reward": 0.10833333730697632, "rewards/format_reward": 0.975000011920929, "rewards/tag_count_reward": 0.7390625178813934, "step": 2231 }, { "clip_ratio": 0.0, "completion_length": 582.1562683105469, "epoch": 0.71435429668747, "grad_norm": 0.1418682336807251, "kl": 0.33628618270158767, "learning_rate": 4.5709614011204794e-06, "loss": 0.124, "reward": 1.8239583849906922, "reward_std": 0.2332266129553318, "rewards/accuracy_reward": 0.14166666865348815, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7364583551883698, "step": 2232 }, { "clip_ratio": 0.0, "completion_length": 586.0396057128906, "epoch": 0.7146743478956633, "grad_norm": 0.17476728558540344, "kl": 0.34939279705286025, "learning_rate": 4.561579189404929e-06, "loss": 0.0956, "reward": 1.7447917103767394, "reward_std": 0.24993923604488372, "rewards/accuracy_reward": 0.056250001303851604, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7343750298023224, "step": 2233 }, { "clip_ratio": 0.0, "completion_length": 566.0020935058594, "epoch": 0.7149943991038566, "grad_norm": 0.14975003898143768, "kl": 0.3478496439754963, "learning_rate": 4.552203770520508e-06, "loss": 0.0921, "reward": 1.7145833611488341, "reward_std": 0.14527590125799178, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.9708333432674408, "rewards/tag_count_reward": 0.737500011920929, "step": 2234 }, { "clip_ratio": 0.0, "completion_length": 594.6812622070313, "epoch": 0.7153144503120499, "grad_norm": 0.2850889563560486, "kl": 0.3510056212544441, "learning_rate": 4.542835156177537e-06, "loss": 0.0883, "reward": 1.7458333730697633, "reward_std": 0.16523813009262084, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7416666865348815, "step": 2235 }, { "clip_ratio": 0.0, "completion_length": 596.1396057128907, "epoch": 0.7156345015202432, "grad_norm": 0.15181845426559448, "kl": 0.4017877370119095, "learning_rate": 4.5334733580778305e-06, "loss": 0.128, "reward": 1.6812500357627869, "reward_std": 0.27939036712050436, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.9375000178813935, "rewards/tag_count_reward": 0.7354166865348816, "step": 2236 }, { "clip_ratio": 0.0, "completion_length": 541.2750244140625, "epoch": 0.7159545527284366, "grad_norm": 0.15312950313091278, "kl": 0.3317554071545601, "learning_rate": 4.5241183879146926e-06, "loss": 0.0944, "reward": 1.7895833849906921, "reward_std": 0.2161307230591774, "rewards/accuracy_reward": 0.08958333618938923, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7375000178813934, "step": 2237 }, { "clip_ratio": 0.0, "completion_length": 583.0541900634765, "epoch": 0.7162746039366299, "grad_norm": 0.21131104230880737, "kl": 0.3239736631512642, "learning_rate": 4.5147702573729015e-06, "loss": 0.108, "reward": 1.8375000476837158, "reward_std": 0.3055271409451962, "rewards/accuracy_reward": 0.15625000558793545, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7312500298023223, "step": 2238 }, { "clip_ratio": 0.0, "completion_length": 565.0250213623046, "epoch": 0.7165946551448231, "grad_norm": 0.2556743621826172, "kl": 0.29245643988251685, "learning_rate": 4.505428978128693e-06, "loss": 0.0769, "reward": 1.7364583730697631, "reward_std": 0.16923416927456855, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.740625011920929, "step": 2239 }, { "clip_ratio": 0.0, "completion_length": 572.1437744140625, "epoch": 0.7169147063530165, "grad_norm": 0.2496984452009201, "kl": 0.4537731699645519, "learning_rate": 4.496094561849741e-06, "loss": 0.117, "reward": 1.844270896911621, "reward_std": 0.22066160291433334, "rewards/accuracy_reward": 0.1500000050291419, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7380208551883698, "step": 2240 }, { "clip_ratio": 0.0, "completion_length": 601.0208618164063, "epoch": 0.7172347575612098, "grad_norm": 0.23430490493774414, "kl": 0.41818406283855436, "learning_rate": 4.486767020195151e-06, "loss": 0.0875, "reward": 1.7734375715255737, "reward_std": 0.24914529621601106, "rewards/accuracy_reward": 0.08958333563059569, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7338541805744171, "step": 2241 }, { "clip_ratio": 0.0, "completion_length": 538.6146087646484, "epoch": 0.7175548087694031, "grad_norm": 0.16890211403369904, "kl": 0.4942522332072258, "learning_rate": 4.4774463648154375e-06, "loss": 0.1103, "reward": 1.7401042222976684, "reward_std": 0.21844895631074907, "rewards/accuracy_reward": 0.0479166679084301, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.735937523841858, "step": 2242 }, { "clip_ratio": 0.0, "completion_length": 584.2750213623046, "epoch": 0.7178748599775964, "grad_norm": 0.12553976476192474, "kl": 0.27406597658991816, "learning_rate": 4.46813260735252e-06, "loss": 0.1205, "reward": 1.756250023841858, "reward_std": 0.24123900830745698, "rewards/accuracy_reward": 0.06875000260770321, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7375000178813934, "step": 2243 }, { "clip_ratio": 0.0, "completion_length": 576.3750244140625, "epoch": 0.7181949111857897, "grad_norm": 0.1890048384666443, "kl": 0.30552619621157645, "learning_rate": 4.4588257594397e-06, "loss": 0.0785, "reward": 1.725000023841858, "reward_std": 0.18632035106420516, "rewards/accuracy_reward": 0.025, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7375000238418579, "step": 2244 }, { "clip_ratio": 0.0, "completion_length": 575.6854309082031, "epoch": 0.718514962393983, "grad_norm": 0.19801272451877594, "kl": 0.32198435068130493, "learning_rate": 4.4495258327016415e-06, "loss": 0.0965, "reward": 1.8354167342185974, "reward_std": 0.2720773681998253, "rewards/accuracy_reward": 0.14583333916962146, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7333333551883697, "step": 2245 }, { "clip_ratio": 0.0, "completion_length": 580.7771057128906, "epoch": 0.7188350136021764, "grad_norm": 0.14209610223770142, "kl": 0.2674247484654188, "learning_rate": 4.44023283875437e-06, "loss": 0.0892, "reward": 1.8104167103767395, "reward_std": 0.20731791108846664, "rewards/accuracy_reward": 0.11041666902601718, "rewards/format_reward": 0.9604166746139526, "rewards/tag_count_reward": 0.7395833492279053, "step": 2246 }, { "clip_ratio": 0.0, "completion_length": 556.7958526611328, "epoch": 0.7191550648103696, "grad_norm": 0.2517361044883728, "kl": 0.29758902490139005, "learning_rate": 4.430946789205255e-06, "loss": 0.1018, "reward": 1.7119791746139525, "reward_std": 0.22636782750487328, "rewards/accuracy_reward": 0.027083333395421506, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7307291865348816, "step": 2247 }, { "clip_ratio": 0.0, "completion_length": 578.150015258789, "epoch": 0.719475116018563, "grad_norm": 0.16266785562038422, "kl": 0.2869046814739704, "learning_rate": 4.421667695652987e-06, "loss": 0.0691, "reward": 1.732812523841858, "reward_std": 0.20949894338846206, "rewards/accuracy_reward": 0.0375000013038516, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7390625178813934, "step": 2248 }, { "clip_ratio": 0.0, "completion_length": 565.993765258789, "epoch": 0.7197951672267563, "grad_norm": 0.09789406508207321, "kl": 0.16912736520171165, "learning_rate": 4.412395569687568e-06, "loss": 0.0375, "reward": 1.7848958611488341, "reward_std": 0.15943676605820656, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7432291746139527, "step": 2249 }, { "clip_ratio": 0.0, "completion_length": 568.8875183105469, "epoch": 0.7201152184349496, "grad_norm": 0.38918501138687134, "kl": 0.31766297519207, "learning_rate": 4.403130422890299e-06, "loss": 0.0914, "reward": 1.7619791865348815, "reward_std": 0.23368189930915834, "rewards/accuracy_reward": 0.06666666716337204, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7369791865348816, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 577.5479248046875, "epoch": 0.7204352696431429, "grad_norm": 0.15137572586536407, "kl": 0.4091979868710041, "learning_rate": 4.393872266833764e-06, "loss": 0.1025, "reward": 1.784375047683716, "reward_std": 0.1803455211222172, "rewards/accuracy_reward": 0.08125000223517417, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7385416805744172, "step": 2251 }, { "clip_ratio": 0.0, "completion_length": 565.0750244140625, "epoch": 0.7207553208513362, "grad_norm": 0.4936150014400482, "kl": 0.4002502106130123, "learning_rate": 4.3846211130818185e-06, "loss": 0.1167, "reward": 1.7380208611488341, "reward_std": 0.23237907737493516, "rewards/accuracy_reward": 0.054166667722165586, "rewards/format_reward": 0.9500000298023223, "rewards/tag_count_reward": 0.7338541865348815, "step": 2252 }, { "clip_ratio": 0.0, "completion_length": 566.0875183105469, "epoch": 0.7210753720595295, "grad_norm": 0.18475115299224854, "kl": 0.27582458928227427, "learning_rate": 4.375376973189559e-06, "loss": 0.0916, "reward": 1.7911458373069764, "reward_std": 0.19789444506168366, "rewards/accuracy_reward": 0.07708333488553762, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7453125238418579, "step": 2253 }, { "clip_ratio": 0.0, "completion_length": 572.302099609375, "epoch": 0.7213954232677229, "grad_norm": 0.3201963007450104, "kl": 0.3628829248249531, "learning_rate": 4.3661398587033355e-06, "loss": 0.0856, "reward": 1.8302083730697631, "reward_std": 0.17892763316631316, "rewards/accuracy_reward": 0.12291667032986879, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7406250238418579, "step": 2254 }, { "clip_ratio": 0.0, "completion_length": 570.0312683105469, "epoch": 0.7217154744759161, "grad_norm": 0.29653018712997437, "kl": 0.3145171828567982, "learning_rate": 4.356909781160716e-06, "loss": 0.1128, "reward": 1.8046875715255737, "reward_std": 0.21797730773687363, "rewards/accuracy_reward": 0.11250000298023224, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.735937523841858, "step": 2255 }, { "clip_ratio": 0.0, "completion_length": 543.9687683105469, "epoch": 0.7220355256841094, "grad_norm": 0.22039726376533508, "kl": 0.4378112189471722, "learning_rate": 4.347686752090482e-06, "loss": 0.1191, "reward": 1.7296875596046448, "reward_std": 0.2500815257430077, "rewards/accuracy_reward": 0.047916668094694616, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7338541865348815, "step": 2256 }, { "clip_ratio": 0.0, "completion_length": 553.7437744140625, "epoch": 0.7223555768923028, "grad_norm": 0.27176499366760254, "kl": 0.49834114536643026, "learning_rate": 4.338470783012609e-06, "loss": 0.1389, "reward": 1.7661458849906921, "reward_std": 0.2512555614113808, "rewards/accuracy_reward": 0.08333333563059568, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7286458492279053, "step": 2257 }, { "clip_ratio": 0.0, "completion_length": 555.535433959961, "epoch": 0.7226756281004961, "grad_norm": 0.15212570130825043, "kl": 0.410233548656106, "learning_rate": 4.3292618854382564e-06, "loss": 0.1073, "reward": 1.7916666984558105, "reward_std": 0.20430766493082048, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7333333492279053, "step": 2258 }, { "clip_ratio": 0.0, "completion_length": 550.9896026611328, "epoch": 0.7229956793086894, "grad_norm": 0.21144744753837585, "kl": 0.49212879687547684, "learning_rate": 4.320060070869747e-06, "loss": 0.1128, "reward": 1.7947917222976684, "reward_std": 0.2727638013660908, "rewards/accuracy_reward": 0.11458333786576987, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7322916865348816, "step": 2259 }, { "clip_ratio": 0.0, "completion_length": 582.808349609375, "epoch": 0.7233157305168827, "grad_norm": 0.26423129439353943, "kl": 0.5360765296965837, "learning_rate": 4.310865350800566e-06, "loss": 0.1251, "reward": 1.7338542103767396, "reward_std": 0.2479950025677681, "rewards/accuracy_reward": 0.08333333637565374, "rewards/format_reward": 0.9270833492279053, "rewards/tag_count_reward": 0.723437511920929, "step": 2260 }, { "clip_ratio": 0.0, "completion_length": 581.3375244140625, "epoch": 0.723635781725076, "grad_norm": 0.48073655366897583, "kl": 0.543912273645401, "learning_rate": 4.3016777367153206e-06, "loss": 0.1057, "reward": 1.7390625476837158, "reward_std": 0.24218678548932077, "rewards/accuracy_reward": 0.05416666828095913, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7307291924953461, "step": 2261 }, { "clip_ratio": 0.0, "completion_length": 579.2375122070313, "epoch": 0.7239558329332694, "grad_norm": 0.27856066823005676, "kl": 0.48950769305229186, "learning_rate": 4.292497240089758e-06, "loss": 0.1204, "reward": 1.7489583849906922, "reward_std": 0.23832616060972214, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.9458333611488342, "rewards/tag_count_reward": 0.7302083551883698, "step": 2262 }, { "clip_ratio": 0.0, "completion_length": 556.6250152587891, "epoch": 0.7242758841414626, "grad_norm": 0.2674890458583832, "kl": 0.5480339720845222, "learning_rate": 4.283323872390728e-06, "loss": 0.1184, "reward": 1.7369791984558105, "reward_std": 0.24941499531269073, "rewards/accuracy_reward": 0.060416667722165585, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7307291984558105, "step": 2263 }, { "clip_ratio": 0.0, "completion_length": 564.045849609375, "epoch": 0.7245959353496559, "grad_norm": 0.12551912665367126, "kl": 0.32917521223425866, "learning_rate": 4.274157645076179e-06, "loss": 0.0759, "reward": 1.789062535762787, "reward_std": 0.2022472068667412, "rewards/accuracy_reward": 0.08125000149011612, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7369791865348816, "step": 2264 }, { "clip_ratio": 0.0, "completion_length": 542.833349609375, "epoch": 0.7249159865578493, "grad_norm": 0.16230449080467224, "kl": 0.269290691614151, "learning_rate": 4.264998569595138e-06, "loss": 0.0549, "reward": 1.8661459326744079, "reward_std": 0.19540103599429132, "rewards/accuracy_reward": 0.15833333861082793, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7411458551883697, "step": 2265 }, { "clip_ratio": 0.0, "completion_length": 545.0396118164062, "epoch": 0.7252360377660426, "grad_norm": 0.17532478272914886, "kl": 0.3482740193605423, "learning_rate": 4.255846657387701e-06, "loss": 0.0833, "reward": 1.7963541865348815, "reward_std": 0.2054486319422722, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7380208492279052, "step": 2266 }, { "clip_ratio": 0.0, "completion_length": 544.8541870117188, "epoch": 0.7255560889742358, "grad_norm": 0.11146936565637589, "kl": 0.29559036940336225, "learning_rate": 4.246701919885017e-06, "loss": 0.0928, "reward": 1.7734375238418578, "reward_std": 0.20702899396419525, "rewards/accuracy_reward": 0.08125000149011612, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7359375178813934, "step": 2267 }, { "clip_ratio": 0.0, "completion_length": 580.8333679199219, "epoch": 0.7258761401824292, "grad_norm": 0.13458676636219025, "kl": 0.368284372985363, "learning_rate": 4.2375643685092745e-06, "loss": 0.0907, "reward": 1.7427083849906921, "reward_std": 0.19133371710777283, "rewards/accuracy_reward": 0.0458333345130086, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7343750238418579, "step": 2268 }, { "clip_ratio": 0.0, "completion_length": 544.6687652587891, "epoch": 0.7261961913906225, "grad_norm": 0.15233393013477325, "kl": 0.19312002062797545, "learning_rate": 4.228434014673679e-06, "loss": 0.0697, "reward": 1.7552083611488343, "reward_std": 0.14367434456944467, "rewards/accuracy_reward": 0.04375000149011612, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7427083551883698, "step": 2269 }, { "clip_ratio": 0.0, "completion_length": 556.3854339599609, "epoch": 0.7265162425988159, "grad_norm": 0.19035114347934723, "kl": 0.3742452569305897, "learning_rate": 4.219310869782453e-06, "loss": 0.0632, "reward": 1.7869791984558105, "reward_std": 0.17124900594353676, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.9729166924953461, "rewards/tag_count_reward": 0.736979192495346, "step": 2270 }, { "clip_ratio": 0.0, "completion_length": 552.4958435058594, "epoch": 0.7268362938070091, "grad_norm": 0.18626444041728973, "kl": 0.29630909487605095, "learning_rate": 4.210194945230815e-06, "loss": 0.0921, "reward": 1.7609375357627868, "reward_std": 0.19042362570762633, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.735937523841858, "step": 2271 }, { "clip_ratio": 0.0, "completion_length": 565.681265258789, "epoch": 0.7271563450152024, "grad_norm": 0.18361890316009521, "kl": 0.27902993783354757, "learning_rate": 4.201086252404962e-06, "loss": 0.0884, "reward": 1.7515625596046447, "reward_std": 0.254750494658947, "rewards/accuracy_reward": 0.06666666846722365, "rewards/format_reward": 0.9520833611488342, "rewards/tag_count_reward": 0.732812511920929, "step": 2272 }, { "clip_ratio": 0.0, "completion_length": 542.4583465576172, "epoch": 0.7274763962233958, "grad_norm": 0.13005219399929047, "kl": 0.25048135630786417, "learning_rate": 4.19198480268206e-06, "loss": 0.0997, "reward": 1.7901041984558106, "reward_std": 0.25187977477908136, "rewards/accuracy_reward": 0.08541666977107525, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7359375178813934, "step": 2273 }, { "clip_ratio": 0.0, "completion_length": 600.5375183105468, "epoch": 0.727796447431589, "grad_norm": 0.07397562265396118, "kl": 0.1687136735767126, "learning_rate": 4.1828906074302255e-06, "loss": 0.0406, "reward": 1.748437523841858, "reward_std": 0.11814125031232833, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.975000011920929, "rewards/tag_count_reward": 0.7401041865348816, "step": 2274 }, { "clip_ratio": 0.0, "completion_length": 573.7416870117188, "epoch": 0.7281164986397823, "grad_norm": 0.10808942466974258, "kl": 0.22886997163295747, "learning_rate": 4.1738036780085175e-06, "loss": 0.0888, "reward": 1.7609375596046448, "reward_std": 0.19000280499458314, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.735937523841858, "step": 2275 }, { "clip_ratio": 0.0, "completion_length": 554.3750244140625, "epoch": 0.7284365498479757, "grad_norm": 0.20324602723121643, "kl": 0.2584953740239143, "learning_rate": 4.164724025766917e-06, "loss": 0.0639, "reward": 1.795312523841858, "reward_std": 0.19345357716083528, "rewards/accuracy_reward": 0.08333333674818277, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7432291865348816, "step": 2276 }, { "clip_ratio": 0.0, "completion_length": 563.2333618164063, "epoch": 0.728756601056169, "grad_norm": 0.15485358238220215, "kl": 0.19948984831571578, "learning_rate": 4.155651662046319e-06, "loss": 0.0708, "reward": 1.8125000476837159, "reward_std": 0.17140211313962936, "rewards/accuracy_reward": 0.09375000428408384, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.743750023841858, "step": 2277 }, { "clip_ratio": 0.0, "completion_length": 555.4083587646485, "epoch": 0.7290766522643622, "grad_norm": 0.12642821669578552, "kl": 0.30407530032098296, "learning_rate": 4.1465865981785055e-06, "loss": 0.1133, "reward": 1.7520833730697631, "reward_std": 0.22057257741689681, "rewards/accuracy_reward": 0.05833333507180214, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7354166805744171, "step": 2278 }, { "clip_ratio": 0.0, "completion_length": 566.9625213623046, "epoch": 0.7293967034725556, "grad_norm": 0.15293292701244354, "kl": 0.3121527761220932, "learning_rate": 4.137528845486152e-06, "loss": 0.085, "reward": 1.825520896911621, "reward_std": 0.162899911403656, "rewards/accuracy_reward": 0.11250000298023224, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7421875238418579, "step": 2279 }, { "clip_ratio": 0.0, "completion_length": 554.7583526611328, "epoch": 0.7297167546807489, "grad_norm": 0.25029483437538147, "kl": 0.4540176376700401, "learning_rate": 4.128478415282795e-06, "loss": 0.1197, "reward": 1.8125000476837159, "reward_std": 0.2490193247795105, "rewards/accuracy_reward": 0.12291667070239783, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7312500238418579, "step": 2280 }, { "clip_ratio": 0.0, "completion_length": 576.8854431152344, "epoch": 0.7300368058889423, "grad_norm": 0.30002540349960327, "kl": 0.27474406994879247, "learning_rate": 4.11943531887283e-06, "loss": 0.0807, "reward": 1.7744791984558106, "reward_std": 0.15698974579572678, "rewards/accuracy_reward": 0.05625000111758709, "rewards/format_reward": 0.9750000238418579, "rewards/tag_count_reward": 0.7432291865348816, "step": 2281 }, { "clip_ratio": 0.0, "completion_length": 568.2104370117188, "epoch": 0.7303568570971355, "grad_norm": 0.09751484543085098, "kl": 0.2339543603360653, "learning_rate": 4.1103995675514865e-06, "loss": 0.1043, "reward": 1.756770873069763, "reward_std": 0.1833608940243721, "rewards/accuracy_reward": 0.05000000149011612, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7421875178813935, "step": 2282 }, { "clip_ratio": 0.0, "completion_length": 559.4479431152344, "epoch": 0.7306769083053288, "grad_norm": 0.10962416976690292, "kl": 0.2689382560551167, "learning_rate": 4.101371172604823e-06, "loss": 0.0902, "reward": 1.748958373069763, "reward_std": 0.177383716404438, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7364583611488342, "step": 2283 }, { "clip_ratio": 0.0, "completion_length": 574.0396118164062, "epoch": 0.7309969595135222, "grad_norm": 0.16082532703876495, "kl": 0.17291892133653164, "learning_rate": 4.0923501453097115e-06, "loss": 0.0739, "reward": 1.7531250357627868, "reward_std": 0.14365102648735045, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7427083492279053, "step": 2284 }, { "clip_ratio": 0.0, "completion_length": 536.5771026611328, "epoch": 0.7313170107217155, "grad_norm": 0.15020890533924103, "kl": 0.2913985226303339, "learning_rate": 4.08333649693382e-06, "loss": 0.0784, "reward": 1.7869791984558105, "reward_std": 0.19107620120048524, "rewards/accuracy_reward": 0.07916666828095913, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7411458492279053, "step": 2285 }, { "clip_ratio": 0.0, "completion_length": 556.0666778564453, "epoch": 0.7316370619299087, "grad_norm": 0.17711031436920166, "kl": 0.36008758544921876, "learning_rate": 4.074330238735592e-06, "loss": 0.0677, "reward": 1.7583333611488343, "reward_std": 0.14769948348402978, "rewards/accuracy_reward": 0.0479166692122817, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7437500178813934, "step": 2286 }, { "clip_ratio": 0.0, "completion_length": 574.268765258789, "epoch": 0.7319571131381021, "grad_norm": 0.08795570582151413, "kl": 0.25694953128695486, "learning_rate": 4.065331381964252e-06, "loss": 0.0852, "reward": 1.8203125238418578, "reward_std": 0.24367085993289947, "rewards/accuracy_reward": 0.11875000391155481, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7390625178813934, "step": 2287 }, { "clip_ratio": 0.0, "completion_length": 563.7375244140625, "epoch": 0.7322771643462954, "grad_norm": 0.13169290125370026, "kl": 0.21410394608974456, "learning_rate": 4.056339937859776e-06, "loss": 0.0555, "reward": 1.8015625476837158, "reward_std": 0.1801581375300884, "rewards/accuracy_reward": 0.08125000391155482, "rewards/format_reward": 0.975000011920929, "rewards/tag_count_reward": 0.7453125178813934, "step": 2288 }, { "clip_ratio": 0.0, "completion_length": 559.9333587646485, "epoch": 0.7325972155544888, "grad_norm": 0.08827610313892365, "kl": 0.39082359373569486, "learning_rate": 4.047355917652877e-06, "loss": 0.0876, "reward": 1.7973958849906921, "reward_std": 0.18793805167078972, "rewards/accuracy_reward": 0.09375000316649676, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7390625238418579, "step": 2289 }, { "clip_ratio": 0.0, "completion_length": 583.5562744140625, "epoch": 0.732917266762682, "grad_norm": 0.2297748476266861, "kl": 0.28518550768494605, "learning_rate": 4.0383793325650025e-06, "loss": 0.0704, "reward": 1.7703125476837158, "reward_std": 0.18034229278564454, "rewards/accuracy_reward": 0.08958333637565374, "rewards/format_reward": 0.950000011920929, "rewards/tag_count_reward": 0.7307291865348816, "step": 2290 }, { "clip_ratio": 0.0, "completion_length": 581.1250061035156, "epoch": 0.7332373179708753, "grad_norm": 0.11378385126590729, "kl": 0.3131607033312321, "learning_rate": 4.0294101938083065e-06, "loss": 0.0978, "reward": 1.7062500238418579, "reward_std": 0.18863984048366547, "rewards/accuracy_reward": 0.008333333395421505, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7354166865348816, "step": 2291 }, { "clip_ratio": 0.0, "completion_length": 562.3500183105468, "epoch": 0.7335573691790687, "grad_norm": 0.15199492871761322, "kl": 0.28899841830134393, "learning_rate": 4.0204485125856465e-06, "loss": 0.0806, "reward": 1.7791666984558105, "reward_std": 0.12981051132082938, "rewards/accuracy_reward": 0.05416666846722364, "rewards/format_reward": 0.9770833551883698, "rewards/tag_count_reward": 0.7479166746139526, "step": 2292 }, { "clip_ratio": 0.0, "completion_length": 556.7479431152344, "epoch": 0.733877420387262, "grad_norm": 0.3340560793876648, "kl": 0.32305283546447755, "learning_rate": 4.0114943000905645e-06, "loss": 0.1183, "reward": 1.7885416984558105, "reward_std": 0.19187554568052292, "rewards/accuracy_reward": 0.08541667014360428, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7364583551883698, "step": 2293 }, { "clip_ratio": 0.0, "completion_length": 534.970849609375, "epoch": 0.7341974715954552, "grad_norm": 0.15596982836723328, "kl": 0.5113927971571683, "learning_rate": 4.00254756750727e-06, "loss": 0.0962, "reward": 1.7895833849906921, "reward_std": 0.206942018866539, "rewards/accuracy_reward": 0.09375000502914191, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7375000178813934, "step": 2294 }, { "clip_ratio": 0.0, "completion_length": 543.6208465576171, "epoch": 0.7345175228036486, "grad_norm": 0.104178786277771, "kl": 0.285752671957016, "learning_rate": 3.993608326010633e-06, "loss": 0.1134, "reward": 1.718750035762787, "reward_std": 0.24399047940969468, "rewards/accuracy_reward": 0.03125000111758709, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7354166865348816, "step": 2295 }, { "clip_ratio": 0.0, "completion_length": 547.2770935058594, "epoch": 0.7348375740118419, "grad_norm": 0.31328830122947693, "kl": 0.3743242934346199, "learning_rate": 3.984676586766167e-06, "loss": 0.0678, "reward": 1.793750035762787, "reward_std": 0.18041975498199464, "rewards/accuracy_reward": 0.08750000204890966, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7395833492279053, "step": 2296 }, { "clip_ratio": 0.0, "completion_length": 556.2833435058594, "epoch": 0.7351576252200352, "grad_norm": 0.09008507430553436, "kl": 0.17614571936428547, "learning_rate": 3.975752360930015e-06, "loss": 0.0706, "reward": 1.916145884990692, "reward_std": 0.18779027387499808, "rewards/accuracy_reward": 0.21041667070239783, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7411458432674408, "step": 2297 }, { "clip_ratio": 0.0, "completion_length": 565.4875122070313, "epoch": 0.7354776764282285, "grad_norm": 0.17964133620262146, "kl": 0.31557943001389505, "learning_rate": 3.9668356596489345e-06, "loss": 0.087, "reward": 1.7958333492279053, "reward_std": 0.22537537813186645, "rewards/accuracy_reward": 0.0979166692122817, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7354166805744171, "step": 2298 }, { "clip_ratio": 0.0, "completion_length": 529.4083404541016, "epoch": 0.7357977276364218, "grad_norm": 0.23957663774490356, "kl": 0.15366022884845734, "learning_rate": 3.957926494060285e-06, "loss": 0.0417, "reward": 1.8901042103767396, "reward_std": 0.1613088697195053, "rewards/accuracy_reward": 0.17083333991467953, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.7421875178813935, "step": 2299 }, { "clip_ratio": 0.0, "completion_length": 550.9021087646485, "epoch": 0.7361177788446152, "grad_norm": 0.14986614882946014, "kl": 0.2438764087855816, "learning_rate": 3.9490248752920116e-06, "loss": 0.0859, "reward": 1.7635417222976684, "reward_std": 0.19552550762891768, "rewards/accuracy_reward": 0.05833333488553762, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7385416805744172, "step": 2300 }, { "clip_ratio": 0.0, "completion_length": 563.9354370117187, "epoch": 0.7364378300528085, "grad_norm": 0.07502276450395584, "kl": 0.18566813617944716, "learning_rate": 3.9401308144626375e-06, "loss": 0.038, "reward": 1.8125000476837159, "reward_std": 0.23270709663629532, "rewards/accuracy_reward": 0.09583333693444729, "rewards/format_reward": 0.975000011920929, "rewards/tag_count_reward": 0.7416666805744171, "step": 2301 }, { "clip_ratio": 0.0, "completion_length": 577.0937744140625, "epoch": 0.7367578812610017, "grad_norm": 0.11627375334501266, "kl": 0.22418252676725386, "learning_rate": 3.931244322681243e-06, "loss": 0.0675, "reward": 1.825520896911621, "reward_std": 0.1442221499979496, "rewards/accuracy_reward": 0.10416666883975267, "rewards/format_reward": 0.9770833432674408, "rewards/tag_count_reward": 0.7442708373069763, "step": 2302 }, { "clip_ratio": 0.0, "completion_length": 548.622933959961, "epoch": 0.7370779324691951, "grad_norm": 0.09706476330757141, "kl": 0.20932482741773129, "learning_rate": 3.922365411047451e-06, "loss": 0.0396, "reward": 1.8552083611488341, "reward_std": 0.12424775958061218, "rewards/accuracy_reward": 0.12916667032986878, "rewards/format_reward": 0.9833333492279053, "rewards/tag_count_reward": 0.7427083492279053, "step": 2303 }, { "clip_ratio": 0.0, "completion_length": 571.7833557128906, "epoch": 0.7373979836773884, "grad_norm": 0.1505707949399948, "kl": 0.15522883646190166, "learning_rate": 3.913494090651421e-06, "loss": 0.0709, "reward": 1.7562500357627868, "reward_std": 0.1329023189842701, "rewards/accuracy_reward": 0.031250000558793546, "rewards/format_reward": 0.9770833432674408, "rewards/tag_count_reward": 0.7479166865348816, "step": 2304 }, { "clip_ratio": 0.0, "completion_length": 547.4521057128907, "epoch": 0.7377180348855817, "grad_norm": 0.1870940774679184, "kl": 0.18360717520117759, "learning_rate": 3.90463037257383e-06, "loss": 0.0767, "reward": 1.864583396911621, "reward_std": 0.19524949863553048, "rewards/accuracy_reward": 0.15208333563059567, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7437500178813934, "step": 2305 }, { "clip_ratio": 0.0, "completion_length": 547.8229309082031, "epoch": 0.738038086093775, "grad_norm": 0.10056561976671219, "kl": 0.25383531153202055, "learning_rate": 3.8957742678858575e-06, "loss": 0.0886, "reward": 1.696875023841858, "reward_std": 0.1846102386713028, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7364583611488342, "step": 2306 }, { "clip_ratio": 0.0, "completion_length": 554.4895935058594, "epoch": 0.7383581373019683, "grad_norm": 0.29490870237350464, "kl": 0.4113804802298546, "learning_rate": 3.8869257876491775e-06, "loss": 0.1208, "reward": 1.8765625715255738, "reward_std": 0.17176087722182273, "rewards/accuracy_reward": 0.17291667275130748, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7432291865348816, "step": 2307 }, { "clip_ratio": 0.0, "completion_length": 555.1604309082031, "epoch": 0.7386781885101616, "grad_norm": 0.34679147601127625, "kl": 0.324555953592062, "learning_rate": 3.8780849429159365e-06, "loss": 0.0636, "reward": 1.773437535762787, "reward_std": 0.14504209160804749, "rewards/accuracy_reward": 0.060416667722165585, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7442708551883698, "step": 2308 }, { "clip_ratio": 0.0, "completion_length": 568.7812683105469, "epoch": 0.738998239718355, "grad_norm": 0.21677498519420624, "kl": 0.22121107652783395, "learning_rate": 3.869251744728745e-06, "loss": 0.0584, "reward": 1.8776042103767394, "reward_std": 0.12323407009243965, "rewards/accuracy_reward": 0.15833333730697632, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7442708492279053, "step": 2309 }, { "clip_ratio": 0.0, "completion_length": 559.7521026611328, "epoch": 0.7393182909265482, "grad_norm": 0.1344781219959259, "kl": 0.2541138086467981, "learning_rate": 3.8604262041206676e-06, "loss": 0.0726, "reward": 1.748958373069763, "reward_std": 0.17084471583366395, "rewards/accuracy_reward": 0.04791666865348816, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7385416865348816, "step": 2310 }, { "clip_ratio": 0.0, "completion_length": 583.4521118164063, "epoch": 0.7396383421347416, "grad_norm": 0.22849154472351074, "kl": 0.26187874004244804, "learning_rate": 3.851608332115192e-06, "loss": 0.055, "reward": 1.7343750238418578, "reward_std": 0.14665495604276657, "rewards/accuracy_reward": 0.022916667722165586, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7447916805744171, "step": 2311 }, { "clip_ratio": 0.0, "completion_length": 564.893765258789, "epoch": 0.7399583933429349, "grad_norm": 0.13005004823207855, "kl": 0.2791564010083675, "learning_rate": 3.842798139726239e-06, "loss": 0.0738, "reward": 1.7609375238418579, "reward_std": 0.18678562864661216, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7421875178813935, "step": 2312 }, { "clip_ratio": 0.0, "completion_length": 580.3458557128906, "epoch": 0.7402784445511282, "grad_norm": 0.22451624274253845, "kl": 0.35189689993858336, "learning_rate": 3.833995637958134e-06, "loss": 0.118, "reward": 1.6958333492279052, "reward_std": 0.18869037181138992, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7416666924953461, "step": 2313 }, { "clip_ratio": 0.0, "completion_length": 561.9021057128906, "epoch": 0.7405984957593215, "grad_norm": 0.24994489550590515, "kl": 0.46310959905385973, "learning_rate": 3.825200837805595e-06, "loss": 0.1139, "reward": 1.7739583849906921, "reward_std": 0.21757967174053192, "rewards/accuracy_reward": 0.08125000316649675, "rewards/format_reward": 0.9562500298023224, "rewards/tag_count_reward": 0.7364583551883698, "step": 2314 }, { "clip_ratio": 0.0, "completion_length": 573.2229370117187, "epoch": 0.7409185469675148, "grad_norm": 0.16934210062026978, "kl": 0.5003368586301804, "learning_rate": 3.8164137502537225e-06, "loss": 0.1277, "reward": 1.7718750476837157, "reward_std": 0.23287726268172265, "rewards/accuracy_reward": 0.08541667070239782, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7343750178813935, "step": 2315 }, { "clip_ratio": 0.0, "completion_length": 565.0041870117187, "epoch": 0.7412385981757081, "grad_norm": 0.22636856138706207, "kl": 0.46233353689312934, "learning_rate": 3.8076343862779795e-06, "loss": 0.1022, "reward": 1.8239583849906922, "reward_std": 0.18512208759784698, "rewards/accuracy_reward": 0.1250000035390258, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7385416924953461, "step": 2316 }, { "clip_ratio": 0.0, "completion_length": 549.0791900634765, "epoch": 0.7415586493839014, "grad_norm": 0.2716544270515442, "kl": 0.45498904660344125, "learning_rate": 3.7988627568441884e-06, "loss": 0.0936, "reward": 1.8187500357627868, "reward_std": 0.16201677322387695, "rewards/accuracy_reward": 0.11041667070239783, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7395833492279053, "step": 2317 }, { "clip_ratio": 0.0, "completion_length": 579.6312683105468, "epoch": 0.7418787005920947, "grad_norm": 0.2425927370786667, "kl": 0.4942165374755859, "learning_rate": 3.7900988729085077e-06, "loss": 0.1107, "reward": 1.7463541865348815, "reward_std": 0.2642780289053917, "rewards/accuracy_reward": 0.06250000149011611, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7317708551883697, "step": 2318 }, { "clip_ratio": 0.0, "completion_length": 563.4625183105469, "epoch": 0.742198751800288, "grad_norm": 0.6389156579971313, "kl": 0.6444176331162452, "learning_rate": 3.7813427454174158e-06, "loss": 0.1153, "reward": 1.7890625238418578, "reward_std": 0.25294766649603845, "rewards/accuracy_reward": 0.1020833369344473, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7328125298023224, "step": 2319 }, { "clip_ratio": 0.0, "completion_length": 571.5083557128906, "epoch": 0.7425188030084814, "grad_norm": 0.17046241462230682, "kl": 0.42304186820983886, "learning_rate": 3.7725943853077105e-06, "loss": 0.0969, "reward": 1.7640625357627868, "reward_std": 0.21301912367343903, "rewards/accuracy_reward": 0.06458333432674408, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7390625178813934, "step": 2320 }, { "clip_ratio": 0.0, "completion_length": 574.8041809082031, "epoch": 0.7428388542166746, "grad_norm": 0.14227454364299774, "kl": 0.5013582430779934, "learning_rate": 3.7638538035064854e-06, "loss": 0.1181, "reward": 1.7229166984558106, "reward_std": 0.20538915917277337, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.9395833492279053, "rewards/tag_count_reward": 0.743750023841858, "step": 2321 }, { "clip_ratio": 0.0, "completion_length": 569.0437652587891, "epoch": 0.743158905424868, "grad_norm": 0.09683094918727875, "kl": 0.28505592197179797, "learning_rate": 3.7551210109311196e-06, "loss": 0.0908, "reward": 1.7859375476837158, "reward_std": 0.20068425834178924, "rewards/accuracy_reward": 0.08750000111758709, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7380208492279052, "step": 2322 }, { "clip_ratio": 0.0, "completion_length": 566.5416931152344, "epoch": 0.7434789566330613, "grad_norm": 0.13085141777992249, "kl": 0.24737758412957192, "learning_rate": 3.746396018489261e-06, "loss": 0.0934, "reward": 1.8333333730697632, "reward_std": 0.21514089405536652, "rewards/accuracy_reward": 0.13125000316649676, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7395833611488343, "step": 2323 }, { "clip_ratio": 0.0, "completion_length": 555.2125122070313, "epoch": 0.7437990078412546, "grad_norm": 0.11616872996091843, "kl": 0.39013450406491756, "learning_rate": 3.7376788370788164e-06, "loss": 0.0622, "reward": 1.759895884990692, "reward_std": 0.15159039273858071, "rewards/accuracy_reward": 0.05625000204890966, "rewards/format_reward": 0.9645833432674408, "rewards/tag_count_reward": 0.7390625, "step": 2324 }, { "clip_ratio": 0.0, "completion_length": 547.8666809082031, "epoch": 0.7441190590494479, "grad_norm": 0.12471839785575867, "kl": 0.1726240862160921, "learning_rate": 3.728969477587935e-06, "loss": 0.0673, "reward": 1.8135417103767395, "reward_std": 0.16715479716658593, "rewards/accuracy_reward": 0.09791666939854622, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7427083432674408, "step": 2325 }, { "clip_ratio": 0.0, "completion_length": 575.9854431152344, "epoch": 0.7444391102576412, "grad_norm": 0.10027755051851273, "kl": 0.16627274565398692, "learning_rate": 3.7202679508950015e-06, "loss": 0.0805, "reward": 1.7869792103767395, "reward_std": 0.1703805223107338, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7411458492279053, "step": 2326 }, { "clip_ratio": 0.0, "completion_length": 564.2875244140625, "epoch": 0.7447591614658345, "grad_norm": 0.08407598733901978, "kl": 0.1928509298712015, "learning_rate": 3.7115742678686053e-06, "loss": 0.1048, "reward": 1.7682291865348816, "reward_std": 0.2047549694776535, "rewards/accuracy_reward": 0.08541667014360428, "rewards/format_reward": 0.9458333432674408, "rewards/tag_count_reward": 0.7369791805744171, "step": 2327 }, { "clip_ratio": 0.0, "completion_length": 542.6541778564454, "epoch": 0.7450792126740279, "grad_norm": 0.22395098209381104, "kl": 0.18675260804593563, "learning_rate": 3.7028884393675478e-06, "loss": 0.0446, "reward": 1.847395896911621, "reward_std": 0.16364309713244438, "rewards/accuracy_reward": 0.12916667126119136, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7432291865348816, "step": 2328 }, { "clip_ratio": 0.0, "completion_length": 555.3396118164062, "epoch": 0.7453992638822211, "grad_norm": 0.20289666950702667, "kl": 0.16297319643199443, "learning_rate": 3.6942104762408183e-06, "loss": 0.0611, "reward": 1.778125035762787, "reward_std": 0.17890555337071418, "rewards/accuracy_reward": 0.07500000204890966, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7406250178813935, "step": 2329 }, { "clip_ratio": 0.0, "completion_length": 530.1729370117188, "epoch": 0.7457193150904144, "grad_norm": 0.07707800716161728, "kl": 0.21096321307122706, "learning_rate": 3.685540389327583e-06, "loss": 0.0795, "reward": 1.756770873069763, "reward_std": 0.14885179400444032, "rewards/accuracy_reward": 0.045833334885537626, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7421875119209289, "step": 2330 }, { "clip_ratio": 0.0, "completion_length": 566.7291839599609, "epoch": 0.7460393662986078, "grad_norm": 0.0692569687962532, "kl": 0.13143852166831493, "learning_rate": 3.676878189457167e-06, "loss": 0.0346, "reward": 1.8354166865348815, "reward_std": 0.15971220657229424, "rewards/accuracy_reward": 0.11666666772216558, "rewards/format_reward": 0.9750000059604644, "rewards/tag_count_reward": 0.7437500119209289, "step": 2331 }, { "clip_ratio": 0.0, "completion_length": 513.533349609375, "epoch": 0.7463594175068011, "grad_norm": 0.08359574526548386, "kl": 0.13114451617002487, "learning_rate": 3.6682238874490463e-06, "loss": 0.0544, "reward": 1.8244791984558106, "reward_std": 0.15698686689138414, "rewards/accuracy_reward": 0.1020833369344473, "rewards/format_reward": 0.9791666865348816, "rewards/tag_count_reward": 0.743229192495346, "step": 2332 }, { "clip_ratio": 0.0, "completion_length": 549.0458435058594, "epoch": 0.7466794687149944, "grad_norm": 0.14677995443344116, "kl": 0.20154299661517144, "learning_rate": 3.6595774941128315e-06, "loss": 0.057, "reward": 1.7786458492279054, "reward_std": 0.15753973126411439, "rewards/accuracy_reward": 0.05416666697710752, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.7473958551883697, "step": 2333 }, { "clip_ratio": 0.0, "completion_length": 554.8000244140625, "epoch": 0.7469995199231877, "grad_norm": 0.27684590220451355, "kl": 0.329647683724761, "learning_rate": 3.6509390202482553e-06, "loss": 0.0634, "reward": 1.7859375476837158, "reward_std": 0.2338223472237587, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7380208432674408, "step": 2334 }, { "clip_ratio": 0.0, "completion_length": 529.5416870117188, "epoch": 0.747319571131381, "grad_norm": 0.11186019331216812, "kl": 0.2218513660132885, "learning_rate": 3.6423084766451622e-06, "loss": 0.0894, "reward": 1.780208373069763, "reward_std": 0.22022225856781005, "rewards/accuracy_reward": 0.07916666772216559, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7406250238418579, "step": 2335 }, { "clip_ratio": 0.0, "completion_length": 532.0458465576172, "epoch": 0.7476396223395744, "grad_norm": 0.13403449952602386, "kl": 0.15975108332931995, "learning_rate": 3.63368587408348e-06, "loss": 0.0551, "reward": 1.8479166984558106, "reward_std": 0.1556932583451271, "rewards/accuracy_reward": 0.12916666977107524, "rewards/format_reward": 0.9770833551883698, "rewards/tag_count_reward": 0.7416666865348815, "step": 2336 }, { "clip_ratio": 0.0, "completion_length": 548.2354400634765, "epoch": 0.7479596735477676, "grad_norm": 0.08647426962852478, "kl": 0.14235255531966687, "learning_rate": 3.6250712233332297e-06, "loss": 0.055, "reward": 1.7786458730697632, "reward_std": 0.205631835013628, "rewards/accuracy_reward": 0.06041666716337204, "rewards/format_reward": 0.9729166924953461, "rewards/tag_count_reward": 0.7453125178813934, "step": 2337 }, { "clip_ratio": 0.0, "completion_length": 575.6270935058594, "epoch": 0.7482797247559609, "grad_norm": 0.11648111790418625, "kl": 0.24137634374201297, "learning_rate": 3.6164645351544956e-06, "loss": 0.0716, "reward": 1.750000035762787, "reward_std": 0.20163048431277275, "rewards/accuracy_reward": 0.043750002048909664, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7375000178813934, "step": 2338 }, { "clip_ratio": 0.0, "completion_length": 567.1875183105469, "epoch": 0.7485997759641543, "grad_norm": 0.1161661297082901, "kl": 0.1347400803118944, "learning_rate": 3.607865820297416e-06, "loss": 0.0517, "reward": 1.7734375238418578, "reward_std": 0.15068410485982894, "rewards/accuracy_reward": 0.05208333563059568, "rewards/format_reward": 0.9770833551883698, "rewards/tag_count_reward": 0.7442708551883698, "step": 2339 }, { "clip_ratio": 0.0, "completion_length": 529.9354370117187, "epoch": 0.7489198271723476, "grad_norm": 0.18194898962974548, "kl": 0.20232655815780162, "learning_rate": 3.5992750895021713e-06, "loss": 0.0326, "reward": 1.7625000119209289, "reward_std": 0.11509302705526352, "rewards/accuracy_reward": 0.029166666977107524, "rewards/format_reward": 0.987500011920929, "rewards/tag_count_reward": 0.7458333373069763, "step": 2340 }, { "clip_ratio": 0.0, "completion_length": 555.2583557128906, "epoch": 0.7492398783805408, "grad_norm": 0.1009315699338913, "kl": 0.23906343355774878, "learning_rate": 3.590692353498968e-06, "loss": 0.0879, "reward": 1.7583333611488343, "reward_std": 0.16905978918075562, "rewards/accuracy_reward": 0.04791666902601719, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.743750023841858, "step": 2341 }, { "clip_ratio": 0.0, "completion_length": 553.5229278564453, "epoch": 0.7495599295887342, "grad_norm": 0.13877518475055695, "kl": 0.14381254725158216, "learning_rate": 3.58211762300803e-06, "loss": 0.0756, "reward": 1.7395833730697632, "reward_std": 0.1424667552113533, "rewards/accuracy_reward": 0.025000000186264516, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7458333551883698, "step": 2342 }, { "clip_ratio": 0.0, "completion_length": 549.8833526611328, "epoch": 0.7498799807969275, "grad_norm": 0.1535481959581375, "kl": 0.211732941493392, "learning_rate": 3.5735509087395815e-06, "loss": 0.0572, "reward": 1.7979166984558106, "reward_std": 0.16340636983513832, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.7416666865348815, "step": 2343 }, { "clip_ratio": 0.0, "completion_length": 572.5916900634766, "epoch": 0.7502000320051209, "grad_norm": 0.21330109238624573, "kl": 0.37927271127700807, "learning_rate": 3.564992221393825e-06, "loss": 0.0941, "reward": 1.7723958611488342, "reward_std": 0.21374720484018325, "rewards/accuracy_reward": 0.07291666902601719, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7411458492279053, "step": 2344 }, { "clip_ratio": 0.0, "completion_length": 521.945849609375, "epoch": 0.7505200832133141, "grad_norm": 0.23470190167427063, "kl": 0.1548861227929592, "learning_rate": 3.556441571660948e-06, "loss": 0.0559, "reward": 1.9057292222976685, "reward_std": 0.1434769354760647, "rewards/accuracy_reward": 0.18333334047347308, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7473958492279053, "step": 2345 }, { "clip_ratio": 0.0, "completion_length": 546.0937744140625, "epoch": 0.7508401344215074, "grad_norm": 0.09681239724159241, "kl": 0.17304839566349983, "learning_rate": 3.5478989702210966e-06, "loss": 0.0841, "reward": 1.7692708730697633, "reward_std": 0.13986653685569764, "rewards/accuracy_reward": 0.04791666883975267, "rewards/format_reward": 0.9750000238418579, "rewards/tag_count_reward": 0.7463541865348816, "step": 2346 }, { "clip_ratio": 0.0, "completion_length": 546.3562713623047, "epoch": 0.7511601856297008, "grad_norm": 0.11565534770488739, "kl": 0.1687497179955244, "learning_rate": 3.5393644277443596e-06, "loss": 0.0706, "reward": 1.7510416865348817, "reward_std": 0.12845450565218924, "rewards/accuracy_reward": 0.025000000186264516, "rewards/format_reward": 0.9791666865348816, "rewards/tag_count_reward": 0.7468750178813934, "step": 2347 }, { "clip_ratio": 0.0, "completion_length": 564.5395935058593, "epoch": 0.7514802368378941, "grad_norm": 0.13690054416656494, "kl": 0.25157116502523424, "learning_rate": 3.5308379548907644e-06, "loss": 0.0988, "reward": 1.7223958492279052, "reward_std": 0.23458851501345634, "rewards/accuracy_reward": 0.027083334513008595, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7369791865348816, "step": 2348 }, { "clip_ratio": 0.0, "completion_length": 558.202099609375, "epoch": 0.7518002880460873, "grad_norm": 0.3587300777435303, "kl": 0.21654266826808452, "learning_rate": 3.522319562310259e-06, "loss": 0.0701, "reward": 1.7614583611488341, "reward_std": 0.1720878452062607, "rewards/accuracy_reward": 0.0583333358168602, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7427083432674408, "step": 2349 }, { "clip_ratio": 0.0, "completion_length": 565.8146026611328, "epoch": 0.7521203392542807, "grad_norm": 0.11443227529525757, "kl": 0.1734992451965809, "learning_rate": 3.513809260642694e-06, "loss": 0.045, "reward": 1.9213542103767396, "reward_std": 0.1500195875763893, "rewards/accuracy_reward": 0.19375000409781934, "rewards/format_reward": 0.9833333492279053, "rewards/tag_count_reward": 0.7442708432674408, "step": 2350 }, { "clip_ratio": 0.0, "completion_length": 567.5625244140625, "epoch": 0.752440390462474, "grad_norm": 0.20784099400043488, "kl": 0.27498736456036565, "learning_rate": 3.505307060517823e-06, "loss": 0.0675, "reward": 1.8005208611488341, "reward_std": 0.21704452484846115, "rewards/accuracy_reward": 0.0958333371207118, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7421875178813935, "step": 2351 }, { "clip_ratio": 0.0, "completion_length": 576.2479461669922, "epoch": 0.7527604416706674, "grad_norm": 0.3816712498664856, "kl": 0.2782451644539833, "learning_rate": 3.496812972555266e-06, "loss": 0.0863, "reward": 1.7848958730697633, "reward_std": 0.13636522069573404, "rewards/accuracy_reward": 0.07083333637565374, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7411458551883697, "step": 2352 }, { "clip_ratio": 0.0, "completion_length": 528.8937713623047, "epoch": 0.7530804928788606, "grad_norm": 0.20585204660892487, "kl": 0.2115282118320465, "learning_rate": 3.488327007364525e-06, "loss": 0.0555, "reward": 1.7791667103767395, "reward_std": 0.13307306319475173, "rewards/accuracy_reward": 0.05833333544433117, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7479166805744171, "step": 2353 }, { "clip_ratio": 0.0, "completion_length": 573.2354370117188, "epoch": 0.7534005440870539, "grad_norm": 0.11561473459005356, "kl": 0.3394802324473858, "learning_rate": 3.4798491755449483e-06, "loss": 0.1463, "reward": 1.7208333611488342, "reward_std": 0.24762727022171022, "rewards/accuracy_reward": 0.02916666753590107, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7437500178813934, "step": 2354 }, { "clip_ratio": 0.0, "completion_length": 573.4229339599609, "epoch": 0.7537205952952473, "grad_norm": 0.19716666638851166, "kl": 0.4750838838517666, "learning_rate": 3.471379487685729e-06, "loss": 0.1206, "reward": 1.7885417103767396, "reward_std": 0.2462085708975792, "rewards/accuracy_reward": 0.09583333600312471, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7406250298023224, "step": 2355 }, { "clip_ratio": 0.0, "completion_length": 556.5479400634765, "epoch": 0.7540406465034406, "grad_norm": 0.1148693636059761, "kl": 0.2551156237721443, "learning_rate": 3.4629179543658852e-06, "loss": 0.0645, "reward": 1.8052083730697632, "reward_std": 0.16417960971593856, "rewards/accuracy_reward": 0.09791667070239782, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7406250298023224, "step": 2356 }, { "clip_ratio": 0.0, "completion_length": 573.2541870117187, "epoch": 0.7543606977116338, "grad_norm": 0.1864629089832306, "kl": 0.3117987260222435, "learning_rate": 3.4544645861542525e-06, "loss": 0.0914, "reward": 1.7963542103767396, "reward_std": 0.249758792668581, "rewards/accuracy_reward": 0.09375000316649676, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7421875238418579, "step": 2357 }, { "clip_ratio": 0.0, "completion_length": 558.941683959961, "epoch": 0.7546807489198272, "grad_norm": 0.18410681188106537, "kl": 0.19241276159882545, "learning_rate": 3.4460193936094644e-06, "loss": 0.0385, "reward": 1.8593750476837159, "reward_std": 0.17560729682445525, "rewards/accuracy_reward": 0.14166666977107525, "rewards/format_reward": 0.9708333611488342, "rewards/tag_count_reward": 0.7468750178813934, "step": 2358 }, { "clip_ratio": 0.0, "completion_length": 534.714599609375, "epoch": 0.7550008001280205, "grad_norm": 0.16778114438056946, "kl": 0.36679080240428447, "learning_rate": 3.437582387279946e-06, "loss": 0.1082, "reward": 1.8822917103767396, "reward_std": 0.20298010110855103, "rewards/accuracy_reward": 0.19375000800937414, "rewards/format_reward": 0.950000011920929, "rewards/tag_count_reward": 0.7385416805744172, "step": 2359 }, { "clip_ratio": 0.0, "completion_length": 553.6500213623046, "epoch": 0.7553208513362137, "grad_norm": 0.2953540086746216, "kl": 0.3251816764473915, "learning_rate": 3.4291535777039e-06, "loss": 0.1073, "reward": 1.7552083730697632, "reward_std": 0.21472963988780974, "rewards/accuracy_reward": 0.058333334513008595, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7406250178813935, "step": 2360 }, { "clip_ratio": 0.0, "completion_length": 582.9479370117188, "epoch": 0.7556409025444071, "grad_norm": 0.2134820818901062, "kl": 0.388304453343153, "learning_rate": 3.4207329754092787e-06, "loss": 0.1032, "reward": 1.807812547683716, "reward_std": 0.2219138652086258, "rewards/accuracy_reward": 0.1104166692122817, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7390625178813934, "step": 2361 }, { "clip_ratio": 0.0, "completion_length": 573.189599609375, "epoch": 0.7559609537526004, "grad_norm": 0.14765672385692596, "kl": 0.37944440804421903, "learning_rate": 3.412320590913796e-06, "loss": 0.0673, "reward": 1.783333384990692, "reward_std": 0.17042958214879037, "rewards/accuracy_reward": 0.07500000130385161, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7437500059604645, "step": 2362 }, { "clip_ratio": 0.0, "completion_length": 584.214599609375, "epoch": 0.7562810049607938, "grad_norm": 0.18267126381397247, "kl": 0.33554785549640653, "learning_rate": 3.4039164347248953e-06, "loss": 0.0857, "reward": 1.7166666746139527, "reward_std": 0.18649079501628876, "rewards/accuracy_reward": 0.012500000558793545, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7416666865348815, "step": 2363 }, { "clip_ratio": 0.0, "completion_length": 573.3229370117188, "epoch": 0.756601056168987, "grad_norm": 0.18736283481121063, "kl": 0.3915620282292366, "learning_rate": 3.3955205173397463e-06, "loss": 0.0988, "reward": 1.7807292222976685, "reward_std": 0.22891742140054702, "rewards/accuracy_reward": 0.09166666958481073, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.743229192495346, "step": 2364 }, { "clip_ratio": 0.0, "completion_length": 570.0021057128906, "epoch": 0.7569211073771803, "grad_norm": 0.2990589737892151, "kl": 0.3989905290305614, "learning_rate": 3.387132849245224e-06, "loss": 0.1328, "reward": 1.7213541746139527, "reward_std": 0.2603048712015152, "rewards/accuracy_reward": 0.0354166679084301, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.735937523841858, "step": 2365 }, { "clip_ratio": 0.0, "completion_length": 560.6687622070312, "epoch": 0.7572411585853737, "grad_norm": 0.1770792007446289, "kl": 0.2988043397665024, "learning_rate": 3.378753440917901e-06, "loss": 0.0809, "reward": 1.7817708492279052, "reward_std": 0.1883620299398899, "rewards/accuracy_reward": 0.07708333432674408, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7442708551883698, "step": 2366 }, { "clip_ratio": 0.0, "completion_length": 579.7437683105469, "epoch": 0.757561209793567, "grad_norm": 0.24669428169727325, "kl": 0.22290822267532348, "learning_rate": 3.3703823028240355e-06, "loss": 0.0688, "reward": 1.763020884990692, "reward_std": 0.18148678839206694, "rewards/accuracy_reward": 0.05208333544433117, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7442708492279053, "step": 2367 }, { "clip_ratio": 0.0, "completion_length": 563.8812683105468, "epoch": 0.7578812610017602, "grad_norm": 0.2096192091703415, "kl": 0.24256822615861892, "learning_rate": 3.3620194454195565e-06, "loss": 0.0841, "reward": 1.7984375476837158, "reward_std": 0.23278064355254174, "rewards/accuracy_reward": 0.10000000204890966, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7380208551883698, "step": 2368 }, { "clip_ratio": 0.0, "completion_length": 577.6125244140625, "epoch": 0.7582013122099536, "grad_norm": 0.2237611711025238, "kl": 0.38547887057065966, "learning_rate": 3.353664879150039e-06, "loss": 0.1108, "reward": 1.745312547683716, "reward_std": 0.22282838672399521, "rewards/accuracy_reward": 0.054166668094694614, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7390625238418579, "step": 2369 }, { "clip_ratio": 0.0, "completion_length": 588.5729309082031, "epoch": 0.7585213634181469, "grad_norm": 0.3807878792285919, "kl": 0.3787414848804474, "learning_rate": 3.3453186144507168e-06, "loss": 0.0842, "reward": 1.8406250596046447, "reward_std": 0.16929481625556947, "rewards/accuracy_reward": 0.1333333384245634, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7427083551883698, "step": 2370 }, { "clip_ratio": 0.0, "completion_length": 586.7750091552734, "epoch": 0.7588414146263402, "grad_norm": 0.19828210771083832, "kl": 0.4050968214869499, "learning_rate": 3.336980661746446e-06, "loss": 0.095, "reward": 1.802083384990692, "reward_std": 0.249814622849226, "rewards/accuracy_reward": 0.11041667070239783, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7395833492279053, "step": 2371 }, { "clip_ratio": 0.0, "completion_length": 577.7395935058594, "epoch": 0.7591614658345335, "grad_norm": 0.18706656992435455, "kl": 0.3668996267020702, "learning_rate": 3.3286510314517027e-06, "loss": 0.0955, "reward": 1.6937500476837157, "reward_std": 0.21575065851211547, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7354166865348816, "step": 2372 }, { "clip_ratio": 0.0, "completion_length": 564.1312713623047, "epoch": 0.7594815170427268, "grad_norm": 0.18040499091148376, "kl": 0.3536977834999561, "learning_rate": 3.3203297339705697e-06, "loss": 0.1014, "reward": 1.8427083849906922, "reward_std": 0.20292569175362588, "rewards/accuracy_reward": 0.14791667070239783, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7364583551883698, "step": 2373 }, { "clip_ratio": 0.0, "completion_length": 581.9500183105469, "epoch": 0.7598015682509202, "grad_norm": 0.17289505898952484, "kl": 0.4077607229351997, "learning_rate": 3.3120167796967195e-06, "loss": 0.1037, "reward": 1.7218750238418579, "reward_std": 0.1947345994412899, "rewards/accuracy_reward": 0.018750000558793545, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7385416805744172, "step": 2374 }, { "clip_ratio": 0.0, "completion_length": 574.6645965576172, "epoch": 0.7601216194591135, "grad_norm": 0.4230034053325653, "kl": 0.45233857035636904, "learning_rate": 3.303712179013404e-06, "loss": 0.0924, "reward": 1.7479166984558105, "reward_std": 0.2048986002802849, "rewards/accuracy_reward": 0.05000000223517418, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7395833551883697, "step": 2375 }, { "clip_ratio": 0.0, "completion_length": 561.5125152587891, "epoch": 0.7604416706673067, "grad_norm": 0.23644131422042847, "kl": 0.47083998322486875, "learning_rate": 3.295415942293445e-06, "loss": 0.1508, "reward": 1.7208333611488342, "reward_std": 0.30071154832839964, "rewards/accuracy_reward": 0.07083333563059568, "rewards/format_reward": 0.922916692495346, "rewards/tag_count_reward": 0.7270833551883698, "step": 2376 }, { "clip_ratio": 0.0, "completion_length": 582.4854370117188, "epoch": 0.7607617218755001, "grad_norm": 0.25301748514175415, "kl": 0.3236188516020775, "learning_rate": 3.2871280798992065e-06, "loss": 0.0775, "reward": 1.8088542103767395, "reward_std": 0.20637039393186568, "rewards/accuracy_reward": 0.10833333600312471, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7401041865348816, "step": 2377 }, { "clip_ratio": 0.0, "completion_length": 568.7437683105469, "epoch": 0.7610817730836934, "grad_norm": 0.2608127295970917, "kl": 0.34846524000167844, "learning_rate": 3.278848602182604e-06, "loss": 0.117, "reward": 1.7947917222976684, "reward_std": 0.23584940135478974, "rewards/accuracy_reward": 0.11875000409781933, "rewards/format_reward": 0.9416666924953461, "rewards/tag_count_reward": 0.7343750238418579, "step": 2378 }, { "clip_ratio": 0.0, "completion_length": 560.7937713623047, "epoch": 0.7614018242918867, "grad_norm": 0.13144290447235107, "kl": 0.27924820110201837, "learning_rate": 3.2705775194850754e-06, "loss": 0.0652, "reward": 1.8026042103767395, "reward_std": 0.22813262045383453, "rewards/accuracy_reward": 0.10000000204890966, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.735937523841858, "step": 2379 }, { "clip_ratio": 0.0, "completion_length": 591.1583526611328, "epoch": 0.76172187550008, "grad_norm": 0.17167526483535767, "kl": 0.49086830765008926, "learning_rate": 3.262314842137573e-06, "loss": 0.1266, "reward": 1.6869792103767396, "reward_std": 0.2478036791086197, "rewards/accuracy_reward": 0.04791666865348816, "rewards/format_reward": 0.9187500238418579, "rewards/tag_count_reward": 0.720312523841858, "step": 2380 }, { "clip_ratio": 0.0, "completion_length": 594.5666931152343, "epoch": 0.7620419267082733, "grad_norm": 0.3797181248664856, "kl": 0.4156690865755081, "learning_rate": 3.2540605804605518e-06, "loss": 0.0797, "reward": 1.7578125476837159, "reward_std": 0.2350001037120819, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7390625178813934, "step": 2381 }, { "clip_ratio": 0.0, "completion_length": 551.8062744140625, "epoch": 0.7623619779164666, "grad_norm": 0.16799494624137878, "kl": 0.29723322987556455, "learning_rate": 3.245814744763953e-06, "loss": 0.0945, "reward": 1.7552083730697632, "reward_std": 0.22091315314173698, "rewards/accuracy_reward": 0.05833333544433117, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7385416865348816, "step": 2382 }, { "clip_ratio": 0.0, "completion_length": 582.0979431152343, "epoch": 0.76268202912466, "grad_norm": 0.5904622673988342, "kl": 0.3778699226677418, "learning_rate": 3.237577345347196e-06, "loss": 0.1171, "reward": 1.7447916865348816, "reward_std": 0.24800491482019424, "rewards/accuracy_reward": 0.06041666865348816, "rewards/format_reward": 0.9458333611488342, "rewards/tag_count_reward": 0.7385416865348816, "step": 2383 }, { "clip_ratio": 0.0, "completion_length": 568.0146118164063, "epoch": 0.7630020803328532, "grad_norm": 0.3403340280056, "kl": 0.29922411739826205, "learning_rate": 3.2293483924991632e-06, "loss": 0.0793, "reward": 1.754687535762787, "reward_std": 0.23587894216179847, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7338541865348815, "step": 2384 }, { "clip_ratio": 0.0, "completion_length": 557.1354248046875, "epoch": 0.7633221315410466, "grad_norm": 0.13508936762809753, "kl": 0.3111469350755215, "learning_rate": 3.2211278964981794e-06, "loss": 0.0799, "reward": 1.86770840883255, "reward_std": 0.24278006702661514, "rewards/accuracy_reward": 0.17500000540167093, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7364583492279053, "step": 2385 }, { "clip_ratio": 0.0, "completion_length": 576.5812744140625, "epoch": 0.7636421827492399, "grad_norm": 0.2847776412963867, "kl": 0.2592492446303368, "learning_rate": 3.2129158676120176e-06, "loss": 0.092, "reward": 1.7619791865348815, "reward_std": 0.1997735843062401, "rewards/accuracy_reward": 0.06250000204890967, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7390625178813934, "step": 2386 }, { "clip_ratio": 0.0, "completion_length": 575.4541870117188, "epoch": 0.7639622339574332, "grad_norm": 0.15784919261932373, "kl": 0.3668780118227005, "learning_rate": 3.2047123160978655e-06, "loss": 0.1224, "reward": 1.734375011920929, "reward_std": 0.25276233106851576, "rewards/accuracy_reward": 0.05000000260770321, "rewards/format_reward": 0.9458333611488342, "rewards/tag_count_reward": 0.7385416924953461, "step": 2387 }, { "clip_ratio": 0.0, "completion_length": 589.952099609375, "epoch": 0.7642822851656265, "grad_norm": 0.1636374145746231, "kl": 0.45332918018102647, "learning_rate": 3.19651725220233e-06, "loss": 0.1207, "reward": 1.7656250476837159, "reward_std": 0.2817038677632809, "rewards/accuracy_reward": 0.0958333371207118, "rewards/format_reward": 0.9354166865348816, "rewards/tag_count_reward": 0.7343750119209289, "step": 2388 }, { "clip_ratio": 0.0, "completion_length": 575.2979370117188, "epoch": 0.7646023363738198, "grad_norm": 0.28287118673324585, "kl": 0.43390736877918246, "learning_rate": 3.1883306861614104e-06, "loss": 0.1094, "reward": 1.7614583849906922, "reward_std": 0.24015092253684997, "rewards/accuracy_reward": 0.0687500013038516, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7364583551883698, "step": 2389 }, { "clip_ratio": 0.0, "completion_length": 583.972933959961, "epoch": 0.7649223875820131, "grad_norm": 0.5688726902008057, "kl": 0.5063559889793396, "learning_rate": 3.180152628200496e-06, "loss": 0.1007, "reward": 1.8213542342185973, "reward_std": 0.29036264047026633, "rewards/accuracy_reward": 0.14375000465661286, "rewards/format_reward": 0.9458333432674408, "rewards/tag_count_reward": 0.7317708611488343, "step": 2390 }, { "clip_ratio": 0.0, "completion_length": 589.9750213623047, "epoch": 0.7652424387902065, "grad_norm": 0.4694216251373291, "kl": 0.37770887836813927, "learning_rate": 3.171983088534346e-06, "loss": 0.1105, "reward": 1.7708333492279054, "reward_std": 0.2839522875845432, "rewards/accuracy_reward": 0.09791666977107524, "rewards/format_reward": 0.9375000178813935, "rewards/tag_count_reward": 0.735416692495346, "step": 2391 }, { "clip_ratio": 0.0, "completion_length": 581.8562561035156, "epoch": 0.7655624899983997, "grad_norm": 0.20927488803863525, "kl": 0.5734367772936821, "learning_rate": 3.1638220773670825e-06, "loss": 0.1439, "reward": 1.7171875476837157, "reward_std": 0.2636822387576103, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.9395833432674408, "rewards/tag_count_reward": 0.7317708432674408, "step": 2392 }, { "clip_ratio": 0.0, "completion_length": 591.2229309082031, "epoch": 0.765882541206593, "grad_norm": 0.3315555453300476, "kl": 0.4113732993602753, "learning_rate": 3.1556696048921764e-06, "loss": 0.0684, "reward": 1.7453125357627868, "reward_std": 0.2074427381157875, "rewards/accuracy_reward": 0.035416666977107526, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7432291865348816, "step": 2393 }, { "clip_ratio": 0.0, "completion_length": 574.8875274658203, "epoch": 0.7662025924147864, "grad_norm": 0.4421977400779724, "kl": 0.5268315270543098, "learning_rate": 3.147525681292425e-06, "loss": 0.0601, "reward": 1.7333333611488342, "reward_std": 0.192195063829422, "rewards/accuracy_reward": 0.07500000316649676, "rewards/format_reward": 0.9375000119209289, "rewards/tag_count_reward": 0.7208333432674408, "step": 2394 }, { "clip_ratio": 0.0, "completion_length": 549.9854339599609, "epoch": 0.7665226436229797, "grad_norm": 0.21242809295654297, "kl": 0.3715293690562248, "learning_rate": 3.1393903167399553e-06, "loss": 0.0978, "reward": 1.835937535762787, "reward_std": 0.2570257142186165, "rewards/accuracy_reward": 0.1541666707023978, "rewards/format_reward": 0.9479166805744171, "rewards/tag_count_reward": 0.7338541746139526, "step": 2395 }, { "clip_ratio": 0.0, "completion_length": 585.5520935058594, "epoch": 0.766842694831173, "grad_norm": 0.20334164798259735, "kl": 0.5021124824881553, "learning_rate": 3.131263521396204e-06, "loss": 0.1268, "reward": 1.728645884990692, "reward_std": 0.22536734342575074, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.736979192495346, "step": 2396 }, { "clip_ratio": 0.0, "completion_length": 582.6146057128906, "epoch": 0.7671627460393663, "grad_norm": 0.22740711271762848, "kl": 0.6462334305047989, "learning_rate": 3.123145305411902e-06, "loss": 0.1419, "reward": 1.7661458611488343, "reward_std": 0.270243152230978, "rewards/accuracy_reward": 0.09166666883975268, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.7328125178813935, "step": 2397 }, { "clip_ratio": 0.0, "completion_length": 565.4958557128906, "epoch": 0.7674827972475596, "grad_norm": 0.11030031740665436, "kl": 0.19400645047426224, "learning_rate": 3.115035678927063e-06, "loss": 0.0823, "reward": 1.8479167222976685, "reward_std": 0.24711870402097702, "rewards/accuracy_reward": 0.14791666772216558, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7375000238418579, "step": 2398 }, { "clip_ratio": 0.0, "completion_length": 571.727099609375, "epoch": 0.767802848455753, "grad_norm": 0.31179970502853394, "kl": 0.3838733732700348, "learning_rate": 3.106934652070975e-06, "loss": 0.1031, "reward": 1.8020833611488343, "reward_std": 0.1734007865190506, "rewards/accuracy_reward": 0.09166666865348816, "rewards/format_reward": 0.9708333432674408, "rewards/tag_count_reward": 0.7395833492279053, "step": 2399 }, { "clip_ratio": 0.0, "completion_length": 569.8354339599609, "epoch": 0.7681228996639462, "grad_norm": 0.21114316582679749, "kl": 0.2608781367540359, "learning_rate": 3.098842234962183e-06, "loss": 0.0605, "reward": 1.8239583730697633, "reward_std": 0.19166183918714524, "rewards/accuracy_reward": 0.11666667088866234, "rewards/format_reward": 0.9625000298023224, "rewards/tag_count_reward": 0.744791692495346, "step": 2400 }, { "clip_ratio": 0.0, "completion_length": 564.9812652587891, "epoch": 0.7684429508721395, "grad_norm": 0.15161120891571045, "kl": 0.43533697724342346, "learning_rate": 3.090758437708482e-06, "loss": 0.0964, "reward": 1.715625023841858, "reward_std": 0.18822606652975082, "rewards/accuracy_reward": 0.018750000186264514, "rewards/format_reward": 0.9583333432674408, "rewards/tag_count_reward": 0.7385416865348816, "step": 2401 }, { "clip_ratio": 0.0, "completion_length": 580.7396118164063, "epoch": 0.7687630020803329, "grad_norm": 0.1593593806028366, "kl": 0.32452878206968305, "learning_rate": 3.08268327040689e-06, "loss": 0.1009, "reward": 1.7885417222976685, "reward_std": 0.22790974006056786, "rewards/accuracy_reward": 0.09583333600312471, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7364583492279053, "step": 2402 }, { "clip_ratio": 0.0, "completion_length": 572.2666809082032, "epoch": 0.7690830532885262, "grad_norm": 0.1515556126832962, "kl": 0.3837066598236561, "learning_rate": 3.0746167431436547e-06, "loss": 0.083, "reward": 1.7276041984558106, "reward_std": 0.24149103313684464, "rewards/accuracy_reward": 0.05000000204890966, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7317708492279053, "step": 2403 }, { "clip_ratio": 0.0, "completion_length": 549.2416839599609, "epoch": 0.7694031044967194, "grad_norm": 0.1699807196855545, "kl": 0.23230726271867752, "learning_rate": 3.0665588659942314e-06, "loss": 0.1027, "reward": 1.8213542103767395, "reward_std": 0.22609889209270478, "rewards/accuracy_reward": 0.12083333563059569, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7401041805744171, "step": 2404 }, { "clip_ratio": 0.0, "completion_length": 558.9437683105468, "epoch": 0.7697231557049128, "grad_norm": 0.12499313056468964, "kl": 0.28099107556045055, "learning_rate": 3.058509649023269e-06, "loss": 0.1095, "reward": 1.729687547683716, "reward_std": 0.18809969127178192, "rewards/accuracy_reward": 0.02500000111758709, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7401041865348816, "step": 2405 }, { "clip_ratio": 0.0, "completion_length": 580.3875183105469, "epoch": 0.7700432069131061, "grad_norm": 0.1266588419675827, "kl": 0.16458383351564407, "learning_rate": 3.050469102284601e-06, "loss": 0.0639, "reward": 1.7901042222976684, "reward_std": 0.14033205881714822, "rewards/accuracy_reward": 0.07500000204890966, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7442708551883698, "step": 2406 }, { "clip_ratio": 0.0, "completion_length": 592.993783569336, "epoch": 0.7703632581212994, "grad_norm": 0.1295279860496521, "kl": 0.37859131768345833, "learning_rate": 3.0424372358212285e-06, "loss": 0.0713, "reward": 1.7776041865348815, "reward_std": 0.22646936923265457, "rewards/accuracy_reward": 0.0854166692122817, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7359375178813934, "step": 2407 }, { "clip_ratio": 0.0, "completion_length": 584.9500122070312, "epoch": 0.7706833093294927, "grad_norm": 0.18353892862796783, "kl": 0.2817696675658226, "learning_rate": 3.0344140596653126e-06, "loss": 0.0866, "reward": 1.7906250476837158, "reward_std": 0.19676049128174783, "rewards/accuracy_reward": 0.08333333488553762, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7385416746139526, "step": 2408 }, { "clip_ratio": 0.0, "completion_length": 579.4458557128906, "epoch": 0.771003360537686, "grad_norm": 0.13988955318927765, "kl": 0.22601069658994674, "learning_rate": 3.026399583838163e-06, "loss": 0.0425, "reward": 1.8536458969116212, "reward_std": 0.1386600524187088, "rewards/accuracy_reward": 0.1312500022351742, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.745312511920929, "step": 2409 }, { "clip_ratio": 0.0, "completion_length": 570.1979309082031, "epoch": 0.7713234117458794, "grad_norm": 0.15479102730751038, "kl": 0.46245444044470785, "learning_rate": 3.0183938183502147e-06, "loss": 0.0515, "reward": 1.8177083730697632, "reward_std": 0.22176536172628403, "rewards/accuracy_reward": 0.11458333861082792, "rewards/format_reward": 0.9625000298023224, "rewards/tag_count_reward": 0.7406250238418579, "step": 2410 }, { "clip_ratio": 0.0, "completion_length": 570.7250061035156, "epoch": 0.7716434629540726, "grad_norm": 0.15402555465698242, "kl": 0.29928482323884964, "learning_rate": 3.0103967732010277e-06, "loss": 0.1084, "reward": 1.7593750476837158, "reward_std": 0.1930878482758999, "rewards/accuracy_reward": 0.05833333507180214, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7406250178813935, "step": 2411 }, { "clip_ratio": 0.0, "completion_length": 587.2312744140625, "epoch": 0.7719635141622659, "grad_norm": 0.10418938845396042, "kl": 0.3145505003631115, "learning_rate": 3.0024084583792702e-06, "loss": 0.0554, "reward": 1.7895833730697632, "reward_std": 0.19940297231078147, "rewards/accuracy_reward": 0.0812500050291419, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7416666924953461, "step": 2412 }, { "clip_ratio": 0.0, "completion_length": 572.2437683105469, "epoch": 0.7722835653704593, "grad_norm": 0.1994236707687378, "kl": 0.19253274872899057, "learning_rate": 2.9944288838627055e-06, "loss": 0.051, "reward": 1.7958333492279053, "reward_std": 0.15394851118326186, "rewards/accuracy_reward": 0.07916666902601718, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7416666746139526, "step": 2413 }, { "clip_ratio": 0.0, "completion_length": 571.1271057128906, "epoch": 0.7726036165786526, "grad_norm": 0.17601707577705383, "kl": 0.3171676769852638, "learning_rate": 2.986458059618179e-06, "loss": 0.0766, "reward": 1.6947917103767396, "reward_std": 0.240276700258255, "rewards/accuracy_reward": 0.01875000074505806, "rewards/format_reward": 0.9395833551883698, "rewards/tag_count_reward": 0.7364583551883698, "step": 2414 }, { "clip_ratio": 0.0, "completion_length": 572.7958557128907, "epoch": 0.7729236677868458, "grad_norm": 0.09672868251800537, "kl": 0.23676921837031842, "learning_rate": 2.978495995601608e-06, "loss": 0.0733, "reward": 1.7895833611488343, "reward_std": 0.15342155173420907, "rewards/accuracy_reward": 0.07708333730697632, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7395833492279053, "step": 2415 }, { "clip_ratio": 0.0, "completion_length": 541.4625213623046, "epoch": 0.7732437189950392, "grad_norm": 0.10931958258152008, "kl": 0.2647506821900606, "learning_rate": 2.970542701757967e-06, "loss": 0.0457, "reward": 1.8171875476837158, "reward_std": 0.15667049512267112, "rewards/accuracy_reward": 0.09791667014360428, "rewards/format_reward": 0.9770833551883698, "rewards/tag_count_reward": 0.7421875238418579, "step": 2416 }, { "clip_ratio": 0.0, "completion_length": 578.3500152587891, "epoch": 0.7735637702032325, "grad_norm": 0.12150561064481735, "kl": 0.19132925122976302, "learning_rate": 2.962598188021275e-06, "loss": 0.0552, "reward": 1.7864583730697632, "reward_std": 0.13132432252168655, "rewards/accuracy_reward": 0.06041666977107525, "rewards/format_reward": 0.9812500178813934, "rewards/tag_count_reward": 0.7447916865348816, "step": 2417 }, { "clip_ratio": 0.0, "completion_length": 583.8458557128906, "epoch": 0.7738838214114259, "grad_norm": 0.13992400467395782, "kl": 0.14227314628660678, "learning_rate": 2.9546624643145894e-06, "loss": 0.0547, "reward": 1.7958333611488342, "reward_std": 0.11764216274023057, "rewards/accuracy_reward": 0.07500000204890966, "rewards/format_reward": 0.975000011920929, "rewards/tag_count_reward": 0.7458333551883698, "step": 2418 }, { "clip_ratio": 0.0, "completion_length": 557.5791870117188, "epoch": 0.7742038726196191, "grad_norm": 0.14344719052314758, "kl": 0.23957008644938468, "learning_rate": 2.9467355405499788e-06, "loss": 0.0858, "reward": 1.754687535762787, "reward_std": 0.2000661239027977, "rewards/accuracy_reward": 0.05000000223517418, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7380208492279052, "step": 2419 }, { "clip_ratio": 0.0, "completion_length": 536.9083435058594, "epoch": 0.7745239238278124, "grad_norm": 0.05815388634800911, "kl": 0.17854432128369807, "learning_rate": 2.9388174266285273e-06, "loss": 0.0459, "reward": 1.8260417103767395, "reward_std": 0.16736711636185647, "rewards/accuracy_reward": 0.10416666865348816, "rewards/format_reward": 0.9791666805744171, "rewards/tag_count_reward": 0.7427083492279053, "step": 2420 }, { "clip_ratio": 0.0, "completion_length": 573.0437744140625, "epoch": 0.7748439750360058, "grad_norm": 0.053522102534770966, "kl": 0.15169395208358766, "learning_rate": 2.9309081324403153e-06, "loss": 0.0783, "reward": 1.8119791984558105, "reward_std": 0.15475299432873726, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.745312511920929, "step": 2421 }, { "clip_ratio": 0.0, "completion_length": 577.0166870117188, "epoch": 0.7751640262441991, "grad_norm": 0.3917308449745178, "kl": 0.21865907534956933, "learning_rate": 2.923007667864405e-06, "loss": 0.0946, "reward": 1.736458384990692, "reward_std": 0.21290394365787507, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.9604166984558106, "rewards/tag_count_reward": 0.7427083611488342, "step": 2422 }, { "clip_ratio": 0.0, "completion_length": 549.1479339599609, "epoch": 0.7754840774523923, "grad_norm": 0.15909360349178314, "kl": 0.27562965378165244, "learning_rate": 2.9151160427688296e-06, "loss": 0.0639, "reward": 1.7421875476837159, "reward_std": 0.18707589358091353, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7359375178813934, "step": 2423 }, { "clip_ratio": 0.0, "completion_length": 561.3541778564453, "epoch": 0.7758041286605857, "grad_norm": 0.08536235988140106, "kl": 0.14693028368055822, "learning_rate": 2.907233267010584e-06, "loss": 0.0413, "reward": 1.7802083611488342, "reward_std": 0.10094428583979606, "rewards/accuracy_reward": 0.054166667722165586, "rewards/format_reward": 0.9791666746139527, "rewards/tag_count_reward": 0.746875011920929, "step": 2424 }, { "clip_ratio": 0.0, "completion_length": 578.0291870117187, "epoch": 0.776124179868779, "grad_norm": 0.12931646406650543, "kl": 0.23414733335375787, "learning_rate": 2.8993593504356065e-06, "loss": 0.051, "reward": 1.8479167222976685, "reward_std": 0.16570336520671844, "rewards/accuracy_reward": 0.13750000428408385, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7395833492279053, "step": 2425 }, { "clip_ratio": 0.0, "completion_length": 575.4229431152344, "epoch": 0.7764442310769724, "grad_norm": 0.10547000914812088, "kl": 0.25148463547229766, "learning_rate": 2.8914943028787756e-06, "loss": 0.0588, "reward": 1.707812535762787, "reward_std": 0.18785856291651726, "rewards/accuracy_reward": 0.01041666679084301, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7411458492279053, "step": 2426 }, { "clip_ratio": 0.0, "completion_length": 578.931265258789, "epoch": 0.7767642822851656, "grad_norm": 0.15780065953731537, "kl": 0.23954942002892493, "learning_rate": 2.883638134163882e-06, "loss": 0.0896, "reward": 1.7562500357627868, "reward_std": 0.17177270203828812, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7458333551883698, "step": 2427 }, { "clip_ratio": 0.0, "completion_length": 551.5020965576172, "epoch": 0.7770843334933589, "grad_norm": 0.240781769156456, "kl": 0.25432484969496727, "learning_rate": 2.8757908541036338e-06, "loss": 0.0615, "reward": 1.8828125596046448, "reward_std": 0.15776183307170868, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7473958492279053, "step": 2428 }, { "clip_ratio": 0.0, "completion_length": 571.5708557128906, "epoch": 0.7774043847015523, "grad_norm": 0.3382117450237274, "kl": 0.3822079569101334, "learning_rate": 2.8679524724996354e-06, "loss": 0.1107, "reward": 1.7885417103767396, "reward_std": 0.2685310781002045, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7364583551883698, "step": 2429 }, { "clip_ratio": 0.0, "completion_length": 593.7562744140625, "epoch": 0.7777244359097456, "grad_norm": 0.2815842926502228, "kl": 0.20215894728899003, "learning_rate": 2.8601229991423787e-06, "loss": 0.0719, "reward": 1.7885417222976685, "reward_std": 0.18276797756552696, "rewards/accuracy_reward": 0.08125000223517417, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7427083551883698, "step": 2430 }, { "clip_ratio": 0.0, "completion_length": 562.8583557128907, "epoch": 0.7780444871179388, "grad_norm": 0.2211749106645584, "kl": 0.2775234118103981, "learning_rate": 2.8523024438112236e-06, "loss": 0.1116, "reward": 1.8385417222976685, "reward_std": 0.18420382663607598, "rewards/accuracy_reward": 0.13125000353902577, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7385416865348816, "step": 2431 }, { "clip_ratio": 0.0, "completion_length": 540.7041748046875, "epoch": 0.7783645383261322, "grad_norm": 0.15589836239814758, "kl": 0.3906994819641113, "learning_rate": 2.8444908162743957e-06, "loss": 0.0551, "reward": 1.8416666865348816, "reward_std": 0.1611462078988552, "rewards/accuracy_reward": 0.13125000447034835, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7458333373069763, "step": 2432 }, { "clip_ratio": 0.0, "completion_length": 561.9520965576172, "epoch": 0.7786845895343255, "grad_norm": 0.15969502925872803, "kl": 0.29722789898514745, "learning_rate": 2.836688126288968e-06, "loss": 0.0927, "reward": 1.8218750357627869, "reward_std": 0.17098113447427749, "rewards/accuracy_reward": 0.1166666716337204, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7447916805744171, "step": 2433 }, { "clip_ratio": 0.0, "completion_length": 584.7354370117188, "epoch": 0.7790046407425189, "grad_norm": 0.2324097454547882, "kl": 0.5922939941287041, "learning_rate": 2.828894383600851e-06, "loss": 0.1457, "reward": 1.8229167222976685, "reward_std": 0.2708844006061554, "rewards/accuracy_reward": 0.12916667126119136, "rewards/format_reward": 0.9541666984558106, "rewards/tag_count_reward": 0.7395833671092987, "step": 2434 }, { "clip_ratio": 0.0, "completion_length": 574.7291870117188, "epoch": 0.7793246919507121, "grad_norm": 0.1611299216747284, "kl": 0.2276984043419361, "learning_rate": 2.8211095979447733e-06, "loss": 0.0911, "reward": 1.817187535762787, "reward_std": 0.18277856037020684, "rewards/accuracy_reward": 0.10833333693444729, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7421875178813935, "step": 2435 }, { "clip_ratio": 0.0, "completion_length": 564.2291839599609, "epoch": 0.7796447431589054, "grad_norm": 0.281716912984848, "kl": 0.5017144948244094, "learning_rate": 2.8133337790442838e-06, "loss": 0.1155, "reward": 1.7838541984558105, "reward_std": 0.21507382094860078, "rewards/accuracy_reward": 0.09166666828095912, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7380208492279052, "step": 2436 }, { "clip_ratio": 0.0, "completion_length": 560.5104370117188, "epoch": 0.7799647943670988, "grad_norm": 0.2546637952327728, "kl": 0.32052686661481855, "learning_rate": 2.805566936611728e-06, "loss": 0.1056, "reward": 1.765625035762787, "reward_std": 0.213297700881958, "rewards/accuracy_reward": 0.06666666772216559, "rewards/format_reward": 0.9562500298023224, "rewards/tag_count_reward": 0.7427083432674408, "step": 2437 }, { "clip_ratio": 0.0, "completion_length": 578.7896057128906, "epoch": 0.7802848455752921, "grad_norm": 0.2136593610048294, "kl": 0.37920288145542147, "learning_rate": 2.7978090803482407e-06, "loss": 0.0881, "reward": 1.742187535762787, "reward_std": 0.22298106253147126, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7401041865348816, "step": 2438 }, { "clip_ratio": 0.0, "completion_length": 597.2854370117187, "epoch": 0.7806048967834853, "grad_norm": 0.2064886838197708, "kl": 0.3057727158069611, "learning_rate": 2.790060219943731e-06, "loss": 0.0851, "reward": 1.7432291984558106, "reward_std": 0.23816211745142937, "rewards/accuracy_reward": 0.05208333507180214, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.736979192495346, "step": 2439 }, { "clip_ratio": 0.0, "completion_length": 555.9708618164062, "epoch": 0.7809249479916787, "grad_norm": 0.24506452679634094, "kl": 0.35978928543627264, "learning_rate": 2.782320365076874e-06, "loss": 0.0667, "reward": 1.814062523841858, "reward_std": 0.17409721985459328, "rewards/accuracy_reward": 0.10208333358168602, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7432291746139527, "step": 2440 }, { "clip_ratio": 0.0, "completion_length": 578.9250183105469, "epoch": 0.781244999199872, "grad_norm": 0.47220608592033386, "kl": 0.5047030732035637, "learning_rate": 2.7745895254150924e-06, "loss": 0.1056, "reward": 1.856250035762787, "reward_std": 0.2518445745110512, "rewards/accuracy_reward": 0.17083333544433116, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7354166746139527, "step": 2441 }, { "clip_ratio": 0.0, "completion_length": 608.6875244140625, "epoch": 0.7815650504080653, "grad_norm": 0.2580389082431793, "kl": 0.48905070424079894, "learning_rate": 2.766867710614557e-06, "loss": 0.1298, "reward": 1.7447916865348816, "reward_std": 0.328003853559494, "rewards/accuracy_reward": 0.08333333618938923, "rewards/format_reward": 0.9354166865348816, "rewards/tag_count_reward": 0.7260416865348815, "step": 2442 }, { "clip_ratio": 0.0, "completion_length": 553.9521118164063, "epoch": 0.7818851016162586, "grad_norm": 0.1950906366109848, "kl": 0.34237351566553115, "learning_rate": 2.7591549303201513e-06, "loss": 0.1036, "reward": 1.818750023841858, "reward_std": 0.24801153615117072, "rewards/accuracy_reward": 0.12708333767950536, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7333333551883697, "step": 2443 }, { "clip_ratio": 0.0, "completion_length": 581.5562683105469, "epoch": 0.7822051528244519, "grad_norm": 0.20304733514785767, "kl": 0.30522238165140153, "learning_rate": 2.75145119416549e-06, "loss": 0.1117, "reward": 1.7088542103767395, "reward_std": 0.23408942371606828, "rewards/accuracy_reward": 0.018750000558793545, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7401041924953461, "step": 2444 }, { "clip_ratio": 0.0, "completion_length": 557.1750244140625, "epoch": 0.7825252040326452, "grad_norm": 0.17914730310440063, "kl": 0.4073436066508293, "learning_rate": 2.7437565117728805e-06, "loss": 0.1271, "reward": 1.7020833611488342, "reward_std": 0.22345729991793634, "rewards/accuracy_reward": 0.01875, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7333333551883697, "step": 2445 }, { "clip_ratio": 0.0, "completion_length": 520.3812622070312, "epoch": 0.7828452552408386, "grad_norm": 0.42733335494995117, "kl": 0.38346418291330336, "learning_rate": 2.7360708927533285e-06, "loss": 0.1283, "reward": 1.8026042103767395, "reward_std": 0.2687413990497589, "rewards/accuracy_reward": 0.11458333656191826, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7380208492279052, "step": 2446 }, { "clip_ratio": 0.0, "completion_length": 570.4437805175781, "epoch": 0.7831653064490318, "grad_norm": 0.13012735545635223, "kl": 0.2576710045337677, "learning_rate": 2.7283943467065153e-06, "loss": 0.0592, "reward": 1.8531250715255738, "reward_std": 0.17226756662130355, "rewards/accuracy_reward": 0.14583333879709243, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7427083492279053, "step": 2447 }, { "clip_ratio": 0.0, "completion_length": 581.4604370117188, "epoch": 0.7834853576572252, "grad_norm": 0.13805796205997467, "kl": 0.33923321291804315, "learning_rate": 2.7207268832207913e-06, "loss": 0.0902, "reward": 1.717187523841858, "reward_std": 0.1830857887864113, "rewards/accuracy_reward": 0.012500000186264515, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7401041924953461, "step": 2448 }, { "clip_ratio": 0.0, "completion_length": 559.3437652587891, "epoch": 0.7838054088654185, "grad_norm": 0.18494828045368195, "kl": 0.2963532693684101, "learning_rate": 2.7130685118731615e-06, "loss": 0.0969, "reward": 1.7625000476837158, "reward_std": 0.2770323887467384, "rewards/accuracy_reward": 0.07083333507180214, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7375000178813934, "step": 2449 }, { "clip_ratio": 0.0, "completion_length": 547.452099609375, "epoch": 0.7841254600736117, "grad_norm": 0.12305200845003128, "kl": 0.35416630394756793, "learning_rate": 2.7054192422292737e-06, "loss": 0.0769, "reward": 1.8255208730697632, "reward_std": 0.22889174297451972, "rewards/accuracy_reward": 0.13750000316649674, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7359375178813934, "step": 2450 }, { "clip_ratio": 0.0, "completion_length": 568.2500183105469, "epoch": 0.7844455112818051, "grad_norm": 0.13378387689590454, "kl": 0.24118001461029054, "learning_rate": 2.6977790838434126e-06, "loss": 0.0881, "reward": 1.8208333849906921, "reward_std": 0.19077629521489142, "rewards/accuracy_reward": 0.12291667070239783, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7375000178813934, "step": 2451 }, { "clip_ratio": 0.0, "completion_length": 589.7333465576172, "epoch": 0.7847655624899984, "grad_norm": 0.14180435240268707, "kl": 0.2530834019184113, "learning_rate": 2.6901480462584707e-06, "loss": 0.093, "reward": 1.7291666984558105, "reward_std": 0.23850278556346893, "rewards/accuracy_reward": 0.037500002235174176, "rewards/format_reward": 0.9541666984558106, "rewards/tag_count_reward": 0.7375000238418579, "step": 2452 }, { "clip_ratio": 0.0, "completion_length": 590.8187683105468, "epoch": 0.7850856136981917, "grad_norm": 0.14379824697971344, "kl": 0.2346596010029316, "learning_rate": 2.68252613900596e-06, "loss": 0.0673, "reward": 1.7880208611488342, "reward_std": 0.18742336928844452, "rewards/accuracy_reward": 0.08958333544433117, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7359375178813934, "step": 2453 }, { "clip_ratio": 0.0, "completion_length": 605.120849609375, "epoch": 0.785405664906385, "grad_norm": 0.2423066347837448, "kl": 0.5001774221658707, "learning_rate": 2.674913371605984e-06, "loss": 0.0934, "reward": 1.7817708611488343, "reward_std": 0.24941302090883255, "rewards/accuracy_reward": 0.09583333432674408, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7380208492279052, "step": 2454 }, { "clip_ratio": 0.0, "completion_length": 565.8250122070312, "epoch": 0.7857257161145783, "grad_norm": 0.10441887378692627, "kl": 0.2186158448457718, "learning_rate": 2.6673097535672287e-06, "loss": 0.0599, "reward": 1.8078125357627868, "reward_std": 0.18876262679696082, "rewards/accuracy_reward": 0.08958333656191826, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7473958492279053, "step": 2455 }, { "clip_ratio": 0.0, "completion_length": 587.7771026611329, "epoch": 0.7860457673227716, "grad_norm": 0.1312214881181717, "kl": 0.22390735670924186, "learning_rate": 2.6597152943869542e-06, "loss": 0.0847, "reward": 1.7880208492279053, "reward_std": 0.19233649373054504, "rewards/accuracy_reward": 0.08541666977107525, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7401041805744171, "step": 2456 }, { "clip_ratio": 0.0, "completion_length": 591.6646118164062, "epoch": 0.786365818530965, "grad_norm": 0.22349661588668823, "kl": 0.32182138562202456, "learning_rate": 2.652130003550981e-06, "loss": 0.0887, "reward": 1.7354166984558106, "reward_std": 0.21236109361052513, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.950000011920929, "rewards/tag_count_reward": 0.7416666805744171, "step": 2457 }, { "clip_ratio": 0.0, "completion_length": 568.320849609375, "epoch": 0.7866858697391582, "grad_norm": 0.11371547728776932, "kl": 0.2892355978488922, "learning_rate": 2.6445538905336764e-06, "loss": 0.1003, "reward": 1.8052083730697632, "reward_std": 0.2052166000008583, "rewards/accuracy_reward": 0.11041667181998491, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7385416865348816, "step": 2458 }, { "clip_ratio": 0.0, "completion_length": 535.4708435058594, "epoch": 0.7870059209473516, "grad_norm": 0.12026123702526093, "kl": 0.38724498003721236, "learning_rate": 2.6369869647979474e-06, "loss": 0.0708, "reward": 1.7208333611488342, "reward_std": 0.2167038567364216, "rewards/accuracy_reward": 0.02500000111758709, "rewards/format_reward": 0.9562500298023224, "rewards/tag_count_reward": 0.7395833551883697, "step": 2459 }, { "clip_ratio": 0.0, "completion_length": 572.3375183105469, "epoch": 0.7873259721555449, "grad_norm": 0.23202356696128845, "kl": 0.3304444134235382, "learning_rate": 2.6294292357952166e-06, "loss": 0.0875, "reward": 1.7151041984558106, "reward_std": 0.20461869090795518, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7401041805744171, "step": 2460 }, { "clip_ratio": 0.0, "completion_length": 579.9562591552734, "epoch": 0.7876460233637382, "grad_norm": 0.1585400104522705, "kl": 0.35580268651247027, "learning_rate": 2.621880712965431e-06, "loss": 0.1243, "reward": 1.7515625476837158, "reward_std": 0.2352686658501625, "rewards/accuracy_reward": 0.05833333563059569, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7390625178813934, "step": 2461 }, { "clip_ratio": 0.0, "completion_length": 574.7437744140625, "epoch": 0.7879660745719315, "grad_norm": 0.39485612511634827, "kl": 0.21709256619215012, "learning_rate": 2.614341405737032e-06, "loss": 0.0933, "reward": 1.7697917222976685, "reward_std": 0.24684084132313727, "rewards/accuracy_reward": 0.08125000316649675, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7385416924953461, "step": 2462 }, { "clip_ratio": 0.0, "completion_length": 562.4083557128906, "epoch": 0.7882861257801248, "grad_norm": 0.1640438437461853, "kl": 0.2398224614560604, "learning_rate": 2.606811323526952e-06, "loss": 0.0811, "reward": 1.8218750476837158, "reward_std": 0.20028617680072786, "rewards/accuracy_reward": 0.12083333730697632, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7427083492279053, "step": 2463 }, { "clip_ratio": 0.0, "completion_length": 567.8500122070312, "epoch": 0.7886061769883181, "grad_norm": 0.12999562919139862, "kl": 0.2820978585630655, "learning_rate": 2.5992904757406025e-06, "loss": 0.0903, "reward": 1.7838542103767394, "reward_std": 0.212898188829422, "rewards/accuracy_reward": 0.08125000335276127, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7421875238418579, "step": 2464 }, { "clip_ratio": 0.0, "completion_length": 583.145849609375, "epoch": 0.7889262281965115, "grad_norm": 0.26787373423576355, "kl": 0.35498605370521547, "learning_rate": 2.5917788717718563e-06, "loss": 0.0853, "reward": 1.7744791984558106, "reward_std": 0.20152606964111328, "rewards/accuracy_reward": 0.07083333637565374, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7432291865348816, "step": 2465 }, { "clip_ratio": 0.0, "completion_length": 553.4375244140625, "epoch": 0.7892462794047047, "grad_norm": 0.2707747519016266, "kl": 0.21967264786362647, "learning_rate": 2.584276521003046e-06, "loss": 0.1241, "reward": 1.775000023841858, "reward_std": 0.25089699029922485, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7354166865348816, "step": 2466 }, { "clip_ratio": 0.0, "completion_length": 585.7021057128907, "epoch": 0.789566330612898, "grad_norm": 0.2889306843280792, "kl": 0.36077115684747696, "learning_rate": 2.5767834328049444e-06, "loss": 0.1118, "reward": 1.7328125357627868, "reward_std": 0.23334373384714127, "rewards/accuracy_reward": 0.04791666865348816, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7328125238418579, "step": 2467 }, { "clip_ratio": 0.0, "completion_length": 565.4750183105468, "epoch": 0.7898863818210914, "grad_norm": 0.12038398534059525, "kl": 0.3180027477443218, "learning_rate": 2.56929961653675e-06, "loss": 0.0808, "reward": 1.7505208730697632, "reward_std": 0.20414262339472772, "rewards/accuracy_reward": 0.05416666921228171, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7380208492279052, "step": 2468 }, { "clip_ratio": 0.0, "completion_length": 575.8916809082032, "epoch": 0.7902064330292847, "grad_norm": 0.17740662395954132, "kl": 0.6491322204470634, "learning_rate": 2.5618250815460864e-06, "loss": 0.1696, "reward": 1.7906250357627869, "reward_std": 0.3307073026895523, "rewards/accuracy_reward": 0.13541667070239782, "rewards/format_reward": 0.9291666865348815, "rewards/tag_count_reward": 0.7260416924953461, "step": 2469 }, { "clip_ratio": 0.0, "completion_length": 568.695849609375, "epoch": 0.790526484237478, "grad_norm": 0.19464203715324402, "kl": 0.40145623236894606, "learning_rate": 2.5543598371689826e-06, "loss": 0.1584, "reward": 1.8687500596046447, "reward_std": 0.2773083925247192, "rewards/accuracy_reward": 0.20000000596046447, "rewards/format_reward": 0.9354166865348816, "rewards/tag_count_reward": 0.7333333551883697, "step": 2470 }, { "clip_ratio": 0.0, "completion_length": 581.8041870117188, "epoch": 0.7908465354456713, "grad_norm": 0.41462260484695435, "kl": 0.38077380508184433, "learning_rate": 2.546903892729864e-06, "loss": 0.0834, "reward": 1.7333333849906922, "reward_std": 0.19426627084612846, "rewards/accuracy_reward": 0.0458333345130086, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.735416692495346, "step": 2471 }, { "clip_ratio": 0.0, "completion_length": 577.802099609375, "epoch": 0.7911665866538646, "grad_norm": 0.31547677516937256, "kl": 0.5052529156208039, "learning_rate": 2.539457257541539e-06, "loss": 0.1091, "reward": 1.7666667103767395, "reward_std": 0.2610872372984886, "rewards/accuracy_reward": 0.09791666772216559, "rewards/format_reward": 0.9375000238418579, "rewards/tag_count_reward": 0.7312500238418579, "step": 2472 }, { "clip_ratio": 0.0, "completion_length": 592.7895935058593, "epoch": 0.791486637862058, "grad_norm": 0.23230285942554474, "kl": 0.46921139508485793, "learning_rate": 2.532019940905186e-06, "loss": 0.1449, "reward": 1.784895920753479, "reward_std": 0.2417847713455558, "rewards/accuracy_reward": 0.11250000409781932, "rewards/format_reward": 0.9395833492279053, "rewards/tag_count_reward": 0.7328125238418579, "step": 2473 }, { "clip_ratio": 0.0, "completion_length": 601.7812744140625, "epoch": 0.7918066890702512, "grad_norm": 0.20156913995742798, "kl": 0.5675974369049073, "learning_rate": 2.524591952110349e-06, "loss": 0.1396, "reward": 1.6666667103767394, "reward_std": 0.308664807677269, "rewards/accuracy_reward": 0.01875000037252903, "rewards/format_reward": 0.9208333551883697, "rewards/tag_count_reward": 0.7270833551883698, "step": 2474 }, { "clip_ratio": 0.0, "completion_length": 557.689599609375, "epoch": 0.7921267402784445, "grad_norm": 0.19317319989204407, "kl": 0.3741036131978035, "learning_rate": 2.5171733004349187e-06, "loss": 0.096, "reward": 1.842708373069763, "reward_std": 0.1966053232550621, "rewards/accuracy_reward": 0.14166667070239783, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7406250238418579, "step": 2475 }, { "clip_ratio": 0.0, "completion_length": 574.2812683105469, "epoch": 0.7924467914866379, "grad_norm": 0.22295664250850677, "kl": 0.5214636474847794, "learning_rate": 2.5097639951451247e-06, "loss": 0.0917, "reward": 1.7364583611488342, "reward_std": 0.20884488373994828, "rewards/accuracy_reward": 0.05416666883975267, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7322916865348816, "step": 2476 }, { "clip_ratio": 0.0, "completion_length": 557.0750183105469, "epoch": 0.7927668426948312, "grad_norm": 0.17876215279102325, "kl": 0.4285578727722168, "learning_rate": 2.5023640454955167e-06, "loss": 0.1435, "reward": 1.7416667222976685, "reward_std": 0.27694963067770006, "rewards/accuracy_reward": 0.08333333637565374, "rewards/format_reward": 0.9270833492279053, "rewards/tag_count_reward": 0.731250011920929, "step": 2477 }, { "clip_ratio": 0.0, "completion_length": 565.6395935058594, "epoch": 0.7930868939030244, "grad_norm": 0.3092890679836273, "kl": 0.3600159421563148, "learning_rate": 2.4949734607289656e-06, "loss": 0.1012, "reward": 1.7359375357627869, "reward_std": 0.20121132731437683, "rewards/accuracy_reward": 0.041666668653488156, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7442708551883698, "step": 2478 }, { "clip_ratio": 0.0, "completion_length": 550.558349609375, "epoch": 0.7934069451112178, "grad_norm": 0.18029294908046722, "kl": 0.42183038890361785, "learning_rate": 2.4875922500766414e-06, "loss": 0.0984, "reward": 1.7750000357627869, "reward_std": 0.2342265397310257, "rewards/accuracy_reward": 0.07291666921228171, "rewards/format_reward": 0.9625000298023224, "rewards/tag_count_reward": 0.7395833551883697, "step": 2479 }, { "clip_ratio": 0.0, "completion_length": 557.2500244140625, "epoch": 0.7937269963194111, "grad_norm": 0.27822908759117126, "kl": 0.41971677392721174, "learning_rate": 2.4802204227580095e-06, "loss": 0.131, "reward": 1.757812511920929, "reward_std": 0.23821898847818374, "rewards/accuracy_reward": 0.07291666828095913, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7328125238418579, "step": 2480 }, { "clip_ratio": 0.0, "completion_length": 564.3062683105469, "epoch": 0.7940470475276045, "grad_norm": 0.3728947043418884, "kl": 0.2817161396145821, "learning_rate": 2.472857987980809e-06, "loss": 0.1114, "reward": 1.7854166984558106, "reward_std": 0.26729344129562377, "rewards/accuracy_reward": 0.09166667107492685, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.735416692495346, "step": 2481 }, { "clip_ratio": 0.0, "completion_length": 545.3041961669921, "epoch": 0.7943670987357977, "grad_norm": 0.1849849373102188, "kl": 0.2868703156709671, "learning_rate": 2.4655049549410535e-06, "loss": 0.0632, "reward": 1.750000035762787, "reward_std": 0.19100956693291665, "rewards/accuracy_reward": 0.05208333395421505, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7375000238418579, "step": 2482 }, { "clip_ratio": 0.0, "completion_length": 564.050015258789, "epoch": 0.794687149943991, "grad_norm": 0.15527111291885376, "kl": 0.4221248269081116, "learning_rate": 2.4581613328230093e-06, "loss": 0.1289, "reward": 1.7213541984558105, "reward_std": 0.2918809249997139, "rewards/accuracy_reward": 0.05208333376795053, "rewards/format_reward": 0.9333333492279052, "rewards/tag_count_reward": 0.7359375178813934, "step": 2483 }, { "clip_ratio": 0.0, "completion_length": 572.483349609375, "epoch": 0.7950072011521844, "grad_norm": 0.2829282581806183, "kl": 0.4145924270153046, "learning_rate": 2.450827130799193e-06, "loss": 0.0803, "reward": 1.789062535762787, "reward_std": 0.18743065968155861, "rewards/accuracy_reward": 0.07500000204890966, "rewards/format_reward": 0.9729166924953461, "rewards/tag_count_reward": 0.7411458551883697, "step": 2484 }, { "clip_ratio": 0.0, "completion_length": 552.9354309082031, "epoch": 0.7953272523603777, "grad_norm": 0.1710597723722458, "kl": 0.441159937530756, "learning_rate": 2.443502358030344e-06, "loss": 0.089, "reward": 1.762500023841858, "reward_std": 0.24500463232398034, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7375000178813934, "step": 2485 }, { "clip_ratio": 0.0, "completion_length": 552.5791778564453, "epoch": 0.7956473035685709, "grad_norm": 0.17100457847118378, "kl": 0.27078391164541243, "learning_rate": 2.436187023665435e-06, "loss": 0.0777, "reward": 1.842708373069763, "reward_std": 0.2264217108488083, "rewards/accuracy_reward": 0.14375000353902578, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7343750178813935, "step": 2486 }, { "clip_ratio": 0.0, "completion_length": 579.4645935058594, "epoch": 0.7959673547767643, "grad_norm": 0.28210291266441345, "kl": 0.27822469994425775, "learning_rate": 2.4288811368416466e-06, "loss": 0.1004, "reward": 1.795312523841858, "reward_std": 0.20817887112498284, "rewards/accuracy_reward": 0.10833333656191826, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7369791805744171, "step": 2487 }, { "clip_ratio": 0.0, "completion_length": 568.908349609375, "epoch": 0.7962874059849576, "grad_norm": 0.12771768867969513, "kl": 0.298288094997406, "learning_rate": 2.421584706684359e-06, "loss": 0.0958, "reward": 1.7255208849906922, "reward_std": 0.18443159461021424, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7401041865348816, "step": 2488 }, { "clip_ratio": 0.0, "completion_length": 567.8416900634766, "epoch": 0.796607457193151, "grad_norm": 0.1272592395544052, "kl": 0.22488604262471198, "learning_rate": 2.4142977423071388e-06, "loss": 0.074, "reward": 1.8083333492279052, "reward_std": 0.17383792996406555, "rewards/accuracy_reward": 0.09166666883975268, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7437500178813934, "step": 2489 }, { "clip_ratio": 0.0, "completion_length": 570.5104431152344, "epoch": 0.7969275084013442, "grad_norm": 0.21532893180847168, "kl": 0.23192031309008598, "learning_rate": 2.4070202528117326e-06, "loss": 0.0543, "reward": 1.7500000476837159, "reward_std": 0.15162527337670326, "rewards/accuracy_reward": 0.037500002235174176, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7437500178813934, "step": 2490 }, { "clip_ratio": 0.0, "completion_length": 539.958349609375, "epoch": 0.7972475596095375, "grad_norm": 0.12722332775592804, "kl": 0.274930589646101, "learning_rate": 2.3997522472880496e-06, "loss": 0.1231, "reward": 1.752083384990692, "reward_std": 0.20301416665315627, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7437500178813934, "step": 2491 }, { "clip_ratio": 0.0, "completion_length": 576.3729309082031, "epoch": 0.7975676108177309, "grad_norm": 0.1642562448978424, "kl": 0.2974217519164085, "learning_rate": 2.3924937348141574e-06, "loss": 0.0781, "reward": 1.7375000238418579, "reward_std": 0.13625881671905518, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7416666805744171, "step": 2492 }, { "clip_ratio": 0.0, "completion_length": 560.7937622070312, "epoch": 0.7978876620259241, "grad_norm": 0.19371792674064636, "kl": 0.25802323296666146, "learning_rate": 2.385244724456256e-06, "loss": 0.0859, "reward": 1.8411459088325501, "reward_std": 0.2050497278571129, "rewards/accuracy_reward": 0.13958333767950534, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7432291746139527, "step": 2493 }, { "clip_ratio": 0.0, "completion_length": 551.6458557128906, "epoch": 0.7982077132341174, "grad_norm": 0.23767606914043427, "kl": 0.22985709607601165, "learning_rate": 2.378005225268689e-06, "loss": 0.0946, "reward": 1.7791666984558105, "reward_std": 0.22321388572454454, "rewards/accuracy_reward": 0.08125000298023224, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7395833551883697, "step": 2494 }, { "clip_ratio": 0.0, "completion_length": 556.8916839599609, "epoch": 0.7985277644423108, "grad_norm": 0.10459578037261963, "kl": 0.18133811727166177, "learning_rate": 2.3707752462939137e-06, "loss": 0.0511, "reward": 1.8338542222976684, "reward_std": 0.12393696308135986, "rewards/accuracy_reward": 0.11041666995733976, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.7463541865348816, "step": 2495 }, { "clip_ratio": 0.0, "completion_length": 528.3854370117188, "epoch": 0.7988478156505041, "grad_norm": 0.14623679220676422, "kl": 0.2732238922268152, "learning_rate": 2.363554796562498e-06, "loss": 0.0925, "reward": 1.7770833730697633, "reward_std": 0.19980213195085525, "rewards/accuracy_reward": 0.06458333488553762, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7395833492279053, "step": 2496 }, { "clip_ratio": 0.0, "completion_length": 555.018765258789, "epoch": 0.7991678668586973, "grad_norm": 0.16956394910812378, "kl": 0.25806930400431155, "learning_rate": 2.3563438850931076e-06, "loss": 0.08, "reward": 1.8343750357627868, "reward_std": 0.16371086835861207, "rewards/accuracy_reward": 0.12708333730697632, "rewards/format_reward": 0.9666666746139526, "rewards/tag_count_reward": 0.7406250238418579, "step": 2497 }, { "clip_ratio": 0.0, "completion_length": 551.739599609375, "epoch": 0.7994879180668907, "grad_norm": 0.2760685086250305, "kl": 0.35679190531373023, "learning_rate": 2.3491425208924934e-06, "loss": 0.0844, "reward": 1.7109375476837159, "reward_std": 0.2751652516424656, "rewards/accuracy_reward": 0.03125000037252903, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7338541865348815, "step": 2498 }, { "clip_ratio": 0.0, "completion_length": 581.708349609375, "epoch": 0.799807969275084, "grad_norm": 0.12274476140737534, "kl": 0.22531968206167222, "learning_rate": 2.341950712955481e-06, "loss": 0.0701, "reward": 1.8500000476837157, "reward_std": 0.16242174208164215, "rewards/accuracy_reward": 0.13750000447034835, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7458333551883698, "step": 2499 }, { "clip_ratio": 0.0, "completion_length": 550.5625152587891, "epoch": 0.8001280204832774, "grad_norm": 0.14290136098861694, "kl": 0.24403711333870887, "learning_rate": 2.334768470264963e-06, "loss": 0.0905, "reward": 1.7848958730697633, "reward_std": 0.2028984658420086, "rewards/accuracy_reward": 0.08750000260770321, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7411458551883697, "step": 2500 }, { "clip_ratio": 0.0, "completion_length": 551.9437744140625, "epoch": 0.8004480716914706, "grad_norm": 0.18500693142414093, "kl": 0.35512991696596147, "learning_rate": 2.3275958017918787e-06, "loss": 0.0749, "reward": 1.803645873069763, "reward_std": 0.23550339192152023, "rewards/accuracy_reward": 0.10000000260770321, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7390625178813934, "step": 2501 }, { "clip_ratio": 0.0, "completion_length": 556.1937744140625, "epoch": 0.8007681228996639, "grad_norm": 0.10297524929046631, "kl": 0.1982058696448803, "learning_rate": 2.3204327164952135e-06, "loss": 0.0697, "reward": 1.7526041984558105, "reward_std": 0.10015429258346557, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.975000011920929, "rewards/tag_count_reward": 0.7442708551883698, "step": 2502 }, { "clip_ratio": 0.0, "completion_length": 558.1083557128907, "epoch": 0.8010881741078573, "grad_norm": 0.20120938122272491, "kl": 0.24297235794365407, "learning_rate": 2.3132792233219814e-06, "loss": 0.0918, "reward": 1.8713542342185974, "reward_std": 0.19600575640797616, "rewards/accuracy_reward": 0.166666672937572, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7401041865348816, "step": 2503 }, { "clip_ratio": 0.0, "completion_length": 536.5979370117187, "epoch": 0.8014082253160506, "grad_norm": 0.0977979525923729, "kl": 0.362245024740696, "learning_rate": 2.3061353312072166e-06, "loss": 0.1023, "reward": 1.7375000357627868, "reward_std": 0.212438702583313, "rewards/accuracy_reward": 0.04791666828095913, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.737500011920929, "step": 2504 }, { "clip_ratio": 0.0, "completion_length": 562.7250183105468, "epoch": 0.8017282765242438, "grad_norm": 0.15742014348506927, "kl": 0.16011352837085724, "learning_rate": 2.29900104907396e-06, "loss": 0.0661, "reward": 1.7265625238418578, "reward_std": 0.1467660591006279, "rewards/accuracy_reward": 0.016666667722165584, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7390625178813934, "step": 2505 }, { "clip_ratio": 0.0, "completion_length": 525.1125122070313, "epoch": 0.8020483277324372, "grad_norm": 0.10794021189212799, "kl": 0.18489291295409202, "learning_rate": 2.2918763858332503e-06, "loss": 0.1016, "reward": 1.8979167222976685, "reward_std": 0.21962611973285676, "rewards/accuracy_reward": 0.1875000052154064, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7416666865348815, "step": 2506 }, { "clip_ratio": 0.0, "completion_length": 554.4854370117188, "epoch": 0.8023683789406305, "grad_norm": 0.2276093065738678, "kl": 0.35973372757434846, "learning_rate": 2.2847613503841094e-06, "loss": 0.0942, "reward": 1.7432291984558106, "reward_std": 0.1830192506313324, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7348958432674408, "step": 2507 }, { "clip_ratio": 0.0, "completion_length": 530.9479339599609, "epoch": 0.8026884301488239, "grad_norm": 0.23760341107845306, "kl": 0.18100916631519795, "learning_rate": 2.2776559516135354e-06, "loss": 0.0375, "reward": 1.767187523841858, "reward_std": 0.12768873944878578, "rewards/accuracy_reward": 0.04583333358168602, "rewards/format_reward": 0.9770833432674408, "rewards/tag_count_reward": 0.7442708432674408, "step": 2508 }, { "clip_ratio": 0.0, "completion_length": 553.4854370117188, "epoch": 0.8030084813570171, "grad_norm": 0.2181500494480133, "kl": 0.23914669267833233, "learning_rate": 2.2705601983964933e-06, "loss": 0.0716, "reward": 1.8031250596046449, "reward_std": 0.18591777086257935, "rewards/accuracy_reward": 0.09375000428408384, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.740625011920929, "step": 2509 }, { "clip_ratio": 0.0, "completion_length": 568.8729278564454, "epoch": 0.8033285325652104, "grad_norm": 0.16643783450126648, "kl": 0.3333664506673813, "learning_rate": 2.2634740995958904e-06, "loss": 0.1014, "reward": 1.7885417103767396, "reward_std": 0.21033181324601175, "rewards/accuracy_reward": 0.08541667275130749, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7406250178813935, "step": 2510 }, { "clip_ratio": 0.0, "completion_length": 547.7166809082031, "epoch": 0.8036485837734038, "grad_norm": 0.20102167129516602, "kl": 0.16737534031271933, "learning_rate": 2.256397664062584e-06, "loss": 0.0671, "reward": 1.7489583611488342, "reward_std": 0.15768709704279898, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7427083492279053, "step": 2511 }, { "clip_ratio": 0.0, "completion_length": 560.3250183105469, "epoch": 0.8039686349815971, "grad_norm": 0.16902852058410645, "kl": 0.383270151168108, "learning_rate": 2.249330900635359e-06, "loss": 0.0886, "reward": 1.8244792342185974, "reward_std": 0.20306602343916894, "rewards/accuracy_reward": 0.11875000372529029, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7411458492279053, "step": 2512 }, { "clip_ratio": 0.0, "completion_length": 534.4354309082031, "epoch": 0.8042886861897903, "grad_norm": 0.10241694003343582, "kl": 0.26669327914714813, "learning_rate": 2.242273818140921e-06, "loss": 0.0803, "reward": 1.7666666865348817, "reward_std": 0.2016952320933342, "rewards/accuracy_reward": 0.06041666828095913, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7375000238418579, "step": 2513 }, { "clip_ratio": 0.0, "completion_length": 574.0687683105468, "epoch": 0.8046087373979837, "grad_norm": 0.10145855695009232, "kl": 0.20489286333322526, "learning_rate": 2.2352264253938795e-06, "loss": 0.0594, "reward": 1.7677083730697631, "reward_std": 0.1323221020400524, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7364583492279053, "step": 2514 }, { "clip_ratio": 0.0, "completion_length": 564.4312805175781, "epoch": 0.804928788606177, "grad_norm": 0.16253073513507843, "kl": 0.16106326691806316, "learning_rate": 2.2281887311967454e-06, "loss": 0.0621, "reward": 1.7890625238418578, "reward_std": 0.14671893119812013, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.9708333432674408, "rewards/tag_count_reward": 0.7453125059604645, "step": 2515 }, { "clip_ratio": 0.0, "completion_length": 536.4541870117188, "epoch": 0.8052488398143703, "grad_norm": 0.2187085896730423, "kl": 0.2485386922955513, "learning_rate": 2.221160744339913e-06, "loss": 0.0566, "reward": 1.8010417222976685, "reward_std": 0.17143189162015915, "rewards/accuracy_reward": 0.09166666772216558, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7406250059604644, "step": 2516 }, { "clip_ratio": 0.0, "completion_length": 530.2645965576172, "epoch": 0.8055688910225636, "grad_norm": 0.1277468055486679, "kl": 0.27220593765378, "learning_rate": 2.214142473601657e-06, "loss": 0.1063, "reward": 1.7572916865348815, "reward_std": 0.19907682836055757, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7322916805744171, "step": 2517 }, { "clip_ratio": 0.0, "completion_length": 564.3958557128906, "epoch": 0.8058889422307569, "grad_norm": 0.5361232757568359, "kl": 0.2836416274309158, "learning_rate": 2.207133927748104e-06, "loss": 0.1048, "reward": 1.7906250357627869, "reward_std": 0.25850230678915975, "rewards/accuracy_reward": 0.0958333358168602, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7343750238418579, "step": 2518 }, { "clip_ratio": 0.0, "completion_length": 540.8479309082031, "epoch": 0.8062089934389502, "grad_norm": 0.21963047981262207, "kl": 0.1898048844188452, "learning_rate": 2.2001351155332453e-06, "loss": 0.0493, "reward": 1.7864583730697632, "reward_std": 0.15694307088851928, "rewards/accuracy_reward": 0.0708333358168602, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7447916805744171, "step": 2519 }, { "clip_ratio": 0.0, "completion_length": 540.4979309082031, "epoch": 0.8065290446471436, "grad_norm": 0.1178651824593544, "kl": 0.20024344846606254, "learning_rate": 2.1931460456989105e-06, "loss": 0.0952, "reward": 1.7979167222976684, "reward_std": 0.18103850185871123, "rewards/accuracy_reward": 0.0916666690260172, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7437500298023224, "step": 2520 }, { "clip_ratio": 0.0, "completion_length": 544.7208465576172, "epoch": 0.8068490958553368, "grad_norm": 0.20870809257030487, "kl": 0.30348594933748246, "learning_rate": 2.1861667269747623e-06, "loss": 0.0773, "reward": 1.7843750596046448, "reward_std": 0.21300148218870163, "rewards/accuracy_reward": 0.09375000223517418, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7385416865348816, "step": 2521 }, { "clip_ratio": 0.0, "completion_length": 572.870849609375, "epoch": 0.8071691470635302, "grad_norm": 0.12147420644760132, "kl": 0.24489268735051156, "learning_rate": 2.179197168078281e-06, "loss": 0.0662, "reward": 1.8171875476837158, "reward_std": 0.13280707448720933, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7421875119209289, "step": 2522 }, { "clip_ratio": 0.0, "completion_length": 577.9146026611328, "epoch": 0.8074891982717235, "grad_norm": 0.20473752915859222, "kl": 0.33478925600647924, "learning_rate": 2.1722373777147574e-06, "loss": 0.1407, "reward": 1.784375047683716, "reward_std": 0.27957783192396163, "rewards/accuracy_reward": 0.10833333358168602, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.7322916865348816, "step": 2523 }, { "clip_ratio": 0.0, "completion_length": 555.5479309082032, "epoch": 0.8078092494799168, "grad_norm": 0.232964888215065, "kl": 0.33908804357051847, "learning_rate": 2.165287364577282e-06, "loss": 0.1077, "reward": 1.823958396911621, "reward_std": 0.23599036335945128, "rewards/accuracy_reward": 0.11666667070239782, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7385416924953461, "step": 2524 }, { "clip_ratio": 0.0, "completion_length": 561.8104370117187, "epoch": 0.8081293006881101, "grad_norm": 0.17493554949760437, "kl": 0.20149580687284468, "learning_rate": 2.158347137346736e-06, "loss": 0.0876, "reward": 1.7171875357627868, "reward_std": 0.20723508298397064, "rewards/accuracy_reward": 0.020833334513008596, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7380208551883698, "step": 2525 }, { "clip_ratio": 0.0, "completion_length": 560.0270965576171, "epoch": 0.8084493518963034, "grad_norm": 0.23425406217575073, "kl": 0.36248365715146064, "learning_rate": 2.1514167046917666e-06, "loss": 0.1021, "reward": 1.7708333611488343, "reward_std": 0.24469319060444833, "rewards/accuracy_reward": 0.07916666828095913, "rewards/format_reward": 0.9500000298023223, "rewards/tag_count_reward": 0.7416666865348815, "step": 2526 }, { "clip_ratio": 0.0, "completion_length": 583.4312683105469, "epoch": 0.8087694031044967, "grad_norm": 0.35288989543914795, "kl": 0.47020969688892367, "learning_rate": 2.1444960752687994e-06, "loss": 0.1475, "reward": 1.7546875476837158, "reward_std": 0.30719054490327835, "rewards/accuracy_reward": 0.09375000204890967, "rewards/format_reward": 0.931250023841858, "rewards/tag_count_reward": 0.7296875178813934, "step": 2527 }, { "clip_ratio": 0.0, "completion_length": 555.0812622070313, "epoch": 0.8090894543126901, "grad_norm": 0.14612969756126404, "kl": 0.1977105811238289, "learning_rate": 2.1375852577220078e-06, "loss": 0.0789, "reward": 1.7734375476837159, "reward_std": 0.1774771437048912, "rewards/accuracy_reward": 0.06041667014360428, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7442708492279053, "step": 2528 }, { "clip_ratio": 0.0, "completion_length": 550.5041931152343, "epoch": 0.8094095055208833, "grad_norm": 0.1703653633594513, "kl": 0.4641824632883072, "learning_rate": 2.1306842606833157e-06, "loss": 0.0848, "reward": 1.7531250238418579, "reward_std": 0.22379239052534103, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7343750238418579, "step": 2529 }, { "clip_ratio": 0.0, "completion_length": 578.2125213623046, "epoch": 0.8097295567290766, "grad_norm": 0.2629508078098297, "kl": 0.30261474251747134, "learning_rate": 2.1237930927723736e-06, "loss": 0.1067, "reward": 1.7213541984558105, "reward_std": 0.249358981102705, "rewards/accuracy_reward": 0.033333334140479565, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7338541865348815, "step": 2530 }, { "clip_ratio": 0.0, "completion_length": 570.2291870117188, "epoch": 0.81004960793727, "grad_norm": 0.15494459867477417, "kl": 0.3502315230667591, "learning_rate": 2.116911762596563e-06, "loss": 0.1289, "reward": 1.7026042103767396, "reward_std": 0.21801614612340928, "rewards/accuracy_reward": 0.016666667722165584, "rewards/format_reward": 0.950000011920929, "rewards/tag_count_reward": 0.7359375178813934, "step": 2531 }, { "clip_ratio": 0.0, "completion_length": 561.8021057128906, "epoch": 0.8103696591454633, "grad_norm": 0.11083235591650009, "kl": 0.14325247332453728, "learning_rate": 2.11004027875097e-06, "loss": 0.0964, "reward": 1.7223958849906922, "reward_std": 0.18362916633486748, "rewards/accuracy_reward": 0.01250000037252903, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7411458432674408, "step": 2532 }, { "clip_ratio": 0.0, "completion_length": 569.0771026611328, "epoch": 0.8106897103536566, "grad_norm": 0.1683712750673294, "kl": 0.2535912752151489, "learning_rate": 2.103178649818387e-06, "loss": 0.075, "reward": 1.7786458730697632, "reward_std": 0.1779824249446392, "rewards/accuracy_reward": 0.07291666977107525, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7411458492279053, "step": 2533 }, { "clip_ratio": 0.0, "completion_length": 569.4458557128906, "epoch": 0.8110097615618499, "grad_norm": 0.12049499154090881, "kl": 0.2638672016561031, "learning_rate": 2.0963268843692986e-06, "loss": 0.0826, "reward": 1.7864583611488343, "reward_std": 0.22827504426240922, "rewards/accuracy_reward": 0.10416667014360428, "rewards/format_reward": 0.943750011920929, "rewards/tag_count_reward": 0.7385416805744172, "step": 2534 }, { "clip_ratio": 0.0, "completion_length": 549.2791809082031, "epoch": 0.8113298127700432, "grad_norm": 0.11559872329235077, "kl": 0.2065212272107601, "learning_rate": 2.089484990961862e-06, "loss": 0.0803, "reward": 1.8260417103767395, "reward_std": 0.1757136031985283, "rewards/accuracy_reward": 0.11875000353902579, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7406250238418579, "step": 2535 }, { "clip_ratio": 0.0, "completion_length": 573.2187805175781, "epoch": 0.8116498639782365, "grad_norm": 0.284346342086792, "kl": 0.4133388787508011, "learning_rate": 2.0826529781419092e-06, "loss": 0.0991, "reward": 1.692187535762787, "reward_std": 0.17340034022927284, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7421875178813935, "step": 2536 }, { "clip_ratio": 0.0, "completion_length": 549.2333435058594, "epoch": 0.8119699151864298, "grad_norm": 0.14568276703357697, "kl": 0.20174489133059978, "learning_rate": 2.0758308544429317e-06, "loss": 0.0962, "reward": 1.7604166865348816, "reward_std": 0.16792564019560813, "rewards/accuracy_reward": 0.05208333432674408, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.743750023841858, "step": 2537 }, { "clip_ratio": 0.0, "completion_length": 548.8271026611328, "epoch": 0.8122899663946231, "grad_norm": 0.11698803305625916, "kl": 0.19524867199361323, "learning_rate": 2.069018628386067e-06, "loss": 0.0881, "reward": 1.8166666984558106, "reward_std": 0.21987561136484146, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.9729166924953461, "rewards/tag_count_reward": 0.7416666805744171, "step": 2538 }, { "clip_ratio": 0.0, "completion_length": 581.2583557128906, "epoch": 0.8126100176028165, "grad_norm": 0.35214290022850037, "kl": 0.5445257410407066, "learning_rate": 2.0622163084800904e-06, "loss": 0.0919, "reward": 1.814583420753479, "reward_std": 0.24731214791536332, "rewards/accuracy_reward": 0.13125000707805157, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7354166746139527, "step": 2539 }, { "clip_ratio": 0.0, "completion_length": 551.2666870117188, "epoch": 0.8129300688110097, "grad_norm": 0.15203234553337097, "kl": 0.2172985278069973, "learning_rate": 2.055423903221404e-06, "loss": 0.0675, "reward": 1.7791666984558105, "reward_std": 0.16751005351543427, "rewards/accuracy_reward": 0.06875000037252903, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7416666865348815, "step": 2540 }, { "clip_ratio": 0.0, "completion_length": 556.4229370117188, "epoch": 0.813250120019203, "grad_norm": 0.20225732028484344, "kl": 0.22776147164404392, "learning_rate": 2.0486414210940266e-06, "loss": 0.0509, "reward": 1.9161458730697631, "reward_std": 0.17580842301249505, "rewards/accuracy_reward": 0.19791666977107525, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7432291805744171, "step": 2541 }, { "clip_ratio": 0.0, "completion_length": 546.5646026611328, "epoch": 0.8135701712273964, "grad_norm": 0.18625636398792267, "kl": 0.4055021218955517, "learning_rate": 2.0418688705695846e-06, "loss": 0.1084, "reward": 1.7718750596046449, "reward_std": 0.2145461067557335, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7427083492279053, "step": 2542 }, { "clip_ratio": 0.0, "completion_length": 544.064599609375, "epoch": 0.8138902224355897, "grad_norm": 0.19601131975650787, "kl": 0.347163225710392, "learning_rate": 2.035106260107291e-06, "loss": 0.0904, "reward": 1.7125000357627869, "reward_std": 0.23749838918447494, "rewards/accuracy_reward": 0.03125000111758709, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.7375000238418579, "step": 2543 }, { "clip_ratio": 0.0, "completion_length": 579.9958557128906, "epoch": 0.814210273643783, "grad_norm": 0.11947452276945114, "kl": 0.39595833867788316, "learning_rate": 2.0283535981539537e-06, "loss": 0.1061, "reward": 1.7432291984558106, "reward_std": 0.21747851520776748, "rewards/accuracy_reward": 0.05833333544433117, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7307291865348816, "step": 2544 }, { "clip_ratio": 0.0, "completion_length": 584.2062622070313, "epoch": 0.8145303248519763, "grad_norm": 0.19575047492980957, "kl": 0.31752732023596764, "learning_rate": 2.021610893143947e-06, "loss": 0.0987, "reward": 1.9182292222976685, "reward_std": 0.2564608708024025, "rewards/accuracy_reward": 0.22291667740792037, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.736979192495346, "step": 2545 }, { "clip_ratio": 0.0, "completion_length": 548.0791778564453, "epoch": 0.8148503760601696, "grad_norm": 0.15424421429634094, "kl": 0.26091369315981866, "learning_rate": 2.0148781534992135e-06, "loss": 0.0926, "reward": 1.7901041984558106, "reward_std": 0.20669013559818267, "rewards/accuracy_reward": 0.09375000204890967, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7359375178813934, "step": 2546 }, { "clip_ratio": 0.0, "completion_length": 535.2270965576172, "epoch": 0.815170427268363, "grad_norm": 0.08642657101154327, "kl": 0.2408471204340458, "learning_rate": 2.008155387629245e-06, "loss": 0.0906, "reward": 1.7385416865348815, "reward_std": 0.22244496196508406, "rewards/accuracy_reward": 0.037500000186264516, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7406250178813935, "step": 2547 }, { "clip_ratio": 0.0, "completion_length": 557.0687744140625, "epoch": 0.8154904784765562, "grad_norm": 0.19214469194412231, "kl": 0.431305243819952, "learning_rate": 2.0014426039310786e-06, "loss": 0.1049, "reward": 1.7687500357627868, "reward_std": 0.22442566826939583, "rewards/accuracy_reward": 0.08750000316649675, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7354166865348816, "step": 2548 }, { "clip_ratio": 0.0, "completion_length": 575.308349609375, "epoch": 0.8158105296847495, "grad_norm": 0.12344229966402054, "kl": 0.21020562946796417, "learning_rate": 1.9947398107892813e-06, "loss": 0.0675, "reward": 1.7697917103767395, "reward_std": 0.1644446700811386, "rewards/accuracy_reward": 0.0541666679084301, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7427083492279053, "step": 2549 }, { "clip_ratio": 0.0, "completion_length": 577.8750183105469, "epoch": 0.8161305808929429, "grad_norm": 0.10697629302740097, "kl": 0.2528723068535328, "learning_rate": 1.9880470165759436e-06, "loss": 0.0409, "reward": 1.7723958611488342, "reward_std": 0.161284402012825, "rewards/accuracy_reward": 0.056250000931322576, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7453125178813934, "step": 2550 }, { "clip_ratio": 0.0, "completion_length": 566.681265258789, "epoch": 0.8164506321011362, "grad_norm": 0.1915271282196045, "kl": 0.28193242400884627, "learning_rate": 1.9813642296506606e-06, "loss": 0.0633, "reward": 1.7947917103767395, "reward_std": 0.15104105249047278, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.740625011920929, "step": 2551 }, { "clip_ratio": 0.0, "completion_length": 538.8458557128906, "epoch": 0.8167706833093294, "grad_norm": 0.12123015522956848, "kl": 0.2618663445115089, "learning_rate": 1.974691458360536e-06, "loss": 0.0738, "reward": 1.859895896911621, "reward_std": 0.25564419105648994, "rewards/accuracy_reward": 0.15833334177732467, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7390625238418579, "step": 2552 }, { "clip_ratio": 0.0, "completion_length": 579.714599609375, "epoch": 0.8170907345175228, "grad_norm": 0.12185723334550858, "kl": 0.23866596780717372, "learning_rate": 1.9680287110401584e-06, "loss": 0.0666, "reward": 1.7625000476837158, "reward_std": 0.21922733038663864, "rewards/accuracy_reward": 0.058333336189389226, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7395833551883697, "step": 2553 }, { "clip_ratio": 0.0, "completion_length": 579.758349609375, "epoch": 0.8174107857257161, "grad_norm": 0.1164194792509079, "kl": 0.2658846389502287, "learning_rate": 1.9613759960115986e-06, "loss": 0.0811, "reward": 1.765625035762787, "reward_std": 0.2186833456158638, "rewards/accuracy_reward": 0.06666666939854622, "rewards/format_reward": 0.9583333432674408, "rewards/tag_count_reward": 0.7406250178813935, "step": 2554 }, { "clip_ratio": 0.0, "completion_length": 552.0395935058593, "epoch": 0.8177308369339095, "grad_norm": 0.08748471736907959, "kl": 0.1893336571753025, "learning_rate": 1.9547333215843945e-06, "loss": 0.087, "reward": 1.746875035762787, "reward_std": 0.17420026957988738, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7427083492279053, "step": 2555 }, { "clip_ratio": 0.0, "completion_length": 565.8479400634766, "epoch": 0.8180508881421027, "grad_norm": 0.14459647238254547, "kl": 0.3010967392474413, "learning_rate": 1.948100696055545e-06, "loss": 0.0752, "reward": 1.7864583730697632, "reward_std": 0.20786819905042647, "rewards/accuracy_reward": 0.08333333544433116, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7385416805744172, "step": 2556 }, { "clip_ratio": 0.0, "completion_length": 566.6166931152344, "epoch": 0.818370939350296, "grad_norm": 0.10996969789266586, "kl": 0.19597923345863819, "learning_rate": 1.9414781277094963e-06, "loss": 0.0468, "reward": 1.7760416865348816, "reward_std": 0.14172435849905013, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7406250178813935, "step": 2557 }, { "clip_ratio": 0.0, "completion_length": 571.2666870117188, "epoch": 0.8186909905584894, "grad_norm": 0.13527487218379974, "kl": 0.27192542925477026, "learning_rate": 1.934865624818132e-06, "loss": 0.0987, "reward": 1.7984375476837158, "reward_std": 0.22232221812009811, "rewards/accuracy_reward": 0.10000000447034836, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7380208432674408, "step": 2558 }, { "clip_ratio": 0.0, "completion_length": 562.4854370117188, "epoch": 0.8190110417666827, "grad_norm": 0.10489752143621445, "kl": 0.21075959838926792, "learning_rate": 1.928263195640767e-06, "loss": 0.0593, "reward": 1.8062500476837158, "reward_std": 0.19212491065263748, "rewards/accuracy_reward": 0.10416667293757201, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7375000178813934, "step": 2559 }, { "clip_ratio": 0.0, "completion_length": 569.1125305175781, "epoch": 0.8193310929748759, "grad_norm": 0.1237000972032547, "kl": 0.3519850574433804, "learning_rate": 1.9216708484241275e-06, "loss": 0.0607, "reward": 1.7927083849906922, "reward_std": 0.20241789817810057, "rewards/accuracy_reward": 0.08541666772216558, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7406250178813935, "step": 2560 }, { "clip_ratio": 0.0, "completion_length": 575.5437744140625, "epoch": 0.8196511441830693, "grad_norm": 0.09780974686145782, "kl": 0.2098309613764286, "learning_rate": 1.915088591402351e-06, "loss": 0.0873, "reward": 1.7880208849906922, "reward_std": 0.2020464301109314, "rewards/accuracy_reward": 0.09166666865348816, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7401041805744171, "step": 2561 }, { "clip_ratio": 0.0, "completion_length": 576.989599609375, "epoch": 0.8199711953912626, "grad_norm": 0.13586272299289703, "kl": 0.30812819600105285, "learning_rate": 1.908516432796973e-06, "loss": 0.0929, "reward": 1.7786458730697632, "reward_std": 0.22650991678237914, "rewards/accuracy_reward": 0.08125000055879354, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7390625178813934, "step": 2562 }, { "clip_ratio": 0.0, "completion_length": 568.8166931152343, "epoch": 0.820291246599456, "grad_norm": 0.10003126412630081, "kl": 0.21191044226288797, "learning_rate": 1.9019543808169117e-06, "loss": 0.0489, "reward": 1.7963542342185974, "reward_std": 0.19010281562805176, "rewards/accuracy_reward": 0.08958333563059569, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7401041924953461, "step": 2563 }, { "clip_ratio": 0.0, "completion_length": 546.8041870117188, "epoch": 0.8206112978076492, "grad_norm": 0.1322830468416214, "kl": 0.2218364529311657, "learning_rate": 1.895402443658465e-06, "loss": 0.0564, "reward": 1.7718750357627868, "reward_std": 0.1890696920454502, "rewards/accuracy_reward": 0.06250000130385161, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.740625011920929, "step": 2564 }, { "clip_ratio": 0.0, "completion_length": 540.5375183105468, "epoch": 0.8209313490158425, "grad_norm": 0.1352192908525467, "kl": 0.29029730148613453, "learning_rate": 1.888860629505297e-06, "loss": 0.0376, "reward": 1.7395833611488343, "reward_std": 0.1509907476603985, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.7416666805744171, "step": 2565 }, { "clip_ratio": 0.0, "completion_length": 561.2833435058594, "epoch": 0.8212514002240359, "grad_norm": 0.1863587647676468, "kl": 0.18537102192640303, "learning_rate": 1.8823289465284244e-06, "loss": 0.083, "reward": 1.7229166865348815, "reward_std": 0.16097078919410707, "rewards/accuracy_reward": 0.018750000558793545, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7416666865348815, "step": 2566 }, { "clip_ratio": 0.0, "completion_length": 573.0083526611328, "epoch": 0.8215714514322292, "grad_norm": 0.08944018930196762, "kl": 0.20662855319678783, "learning_rate": 1.8758074028862161e-06, "loss": 0.0705, "reward": 1.7895833730697632, "reward_std": 0.1706282950937748, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7416666865348815, "step": 2567 }, { "clip_ratio": 0.0, "completion_length": 561.0479370117188, "epoch": 0.8218915026404224, "grad_norm": 0.29065433144569397, "kl": 0.29194722771644593, "learning_rate": 1.869296006724366e-06, "loss": 0.0541, "reward": 1.7531250357627868, "reward_std": 0.21704104840755462, "rewards/accuracy_reward": 0.05416666883975267, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7385416865348816, "step": 2568 }, { "clip_ratio": 0.0, "completion_length": 586.064599609375, "epoch": 0.8222115538486158, "grad_norm": 0.16143926978111267, "kl": 0.3712439864873886, "learning_rate": 1.8627947661759027e-06, "loss": 0.1243, "reward": 1.7343750238418578, "reward_std": 0.22560944259166718, "rewards/accuracy_reward": 0.04375000149011612, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7364583671092987, "step": 2569 }, { "clip_ratio": 0.0, "completion_length": 569.4041809082031, "epoch": 0.8225316050568091, "grad_norm": 0.15806716680526733, "kl": 0.2439609609544277, "learning_rate": 1.8563036893611664e-06, "loss": 0.0698, "reward": 1.7145833611488341, "reward_std": 0.14413456842303277, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7416666805744171, "step": 2570 }, { "clip_ratio": 0.0, "completion_length": 554.1458465576172, "epoch": 0.8228516562650025, "grad_norm": 0.10522006452083588, "kl": 0.23359978944063187, "learning_rate": 1.8498227843878025e-06, "loss": 0.0774, "reward": 1.7682292103767394, "reward_std": 0.18255340680480003, "rewards/accuracy_reward": 0.06458333507180214, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7390625238418579, "step": 2571 }, { "clip_ratio": 0.0, "completion_length": 571.4062744140625, "epoch": 0.8231717074731957, "grad_norm": 0.12738092243671417, "kl": 0.4491455115377903, "learning_rate": 1.8433520593507515e-06, "loss": 0.1236, "reward": 1.7473958730697632, "reward_std": 0.22424405813217163, "rewards/accuracy_reward": 0.05625000149011612, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.739062511920929, "step": 2572 }, { "clip_ratio": 0.0, "completion_length": 577.6562713623047, "epoch": 0.823491758681389, "grad_norm": 0.184599369764328, "kl": 0.6130537793040276, "learning_rate": 1.8368915223322392e-06, "loss": 0.1159, "reward": 1.7661458730697632, "reward_std": 0.26158987879753115, "rewards/accuracy_reward": 0.0916666690260172, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.732812511920929, "step": 2573 }, { "clip_ratio": 0.0, "completion_length": 573.7875183105468, "epoch": 0.8238118098895824, "grad_norm": 0.15006475150585175, "kl": 0.25845105201005936, "learning_rate": 1.8304411814017654e-06, "loss": 0.0903, "reward": 1.752083384990692, "reward_std": 0.20691974982619285, "rewards/accuracy_reward": 0.05625000260770321, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.735416692495346, "step": 2574 }, { "clip_ratio": 0.0, "completion_length": 579.2041870117188, "epoch": 0.8241318610977757, "grad_norm": 0.20258435606956482, "kl": 0.5631472624838352, "learning_rate": 1.8240010446160973e-06, "loss": 0.1084, "reward": 1.7848958849906922, "reward_std": 0.27376395016908645, "rewards/accuracy_reward": 0.10625000149011612, "rewards/format_reward": 0.9479166805744171, "rewards/tag_count_reward": 0.7307291865348816, "step": 2575 }, { "clip_ratio": 0.0, "completion_length": 574.4166961669922, "epoch": 0.8244519123059689, "grad_norm": 0.1155025064945221, "kl": 0.31104291453957555, "learning_rate": 1.817571120019248e-06, "loss": 0.0822, "reward": 1.8125000596046448, "reward_std": 0.18279174268245696, "rewards/accuracy_reward": 0.11458333637565374, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7395833551883697, "step": 2576 }, { "clip_ratio": 0.0, "completion_length": 609.0000183105469, "epoch": 0.8247719635141623, "grad_norm": 0.17100323736667633, "kl": 0.2843918614089489, "learning_rate": 1.811151415642487e-06, "loss": 0.0943, "reward": 1.7473958730697632, "reward_std": 0.21448913291096688, "rewards/accuracy_reward": 0.03958333469927311, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.739062511920929, "step": 2577 }, { "clip_ratio": 0.0, "completion_length": 567.7916809082031, "epoch": 0.8250920147223556, "grad_norm": 0.10874886065721512, "kl": 0.35035271644592286, "learning_rate": 1.8047419395043086e-06, "loss": 0.1229, "reward": 1.7109375476837159, "reward_std": 0.23503211364150048, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.9333333492279052, "rewards/tag_count_reward": 0.735937523841858, "step": 2578 }, { "clip_ratio": 0.0, "completion_length": 588.4750244140625, "epoch": 0.8254120659305488, "grad_norm": 0.11109782010316849, "kl": 0.42124783545732497, "learning_rate": 1.798342699610438e-06, "loss": 0.1023, "reward": 1.707812523841858, "reward_std": 0.25911448076367377, "rewards/accuracy_reward": 0.03125000111758709, "rewards/format_reward": 0.9479166805744171, "rewards/tag_count_reward": 0.7286458492279053, "step": 2579 }, { "clip_ratio": 0.0, "completion_length": 594.520849609375, "epoch": 0.8257321171387422, "grad_norm": 0.19008983671665192, "kl": 0.3546397894620895, "learning_rate": 1.7919537039538127e-06, "loss": 0.1139, "reward": 1.7692708611488341, "reward_std": 0.23377685472369195, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7338541805744171, "step": 2580 }, { "clip_ratio": 0.0, "completion_length": 552.2687744140625, "epoch": 0.8260521683469355, "grad_norm": 0.2240857034921646, "kl": 0.29563959799706935, "learning_rate": 1.7855749605145722e-06, "loss": 0.0933, "reward": 1.7614583730697633, "reward_std": 0.18812100738286971, "rewards/accuracy_reward": 0.05416666828095913, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.7385416865348816, "step": 2581 }, { "clip_ratio": 0.0, "completion_length": 566.7458435058594, "epoch": 0.8263722195551289, "grad_norm": 0.28932392597198486, "kl": 0.3456702195107937, "learning_rate": 1.7792064772600547e-06, "loss": 0.0919, "reward": 1.7651042222976685, "reward_std": 0.229879729449749, "rewards/accuracy_reward": 0.07083333563059568, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7338541805744171, "step": 2582 }, { "clip_ratio": 0.0, "completion_length": 583.4062683105469, "epoch": 0.8266922707633221, "grad_norm": 0.13011673092842102, "kl": 0.16959348358213902, "learning_rate": 1.7728482621447818e-06, "loss": 0.0893, "reward": 1.7395833611488343, "reward_std": 0.1753227561712265, "rewards/accuracy_reward": 0.03125000204890967, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.743750023841858, "step": 2583 }, { "clip_ratio": 0.0, "completion_length": 580.802099609375, "epoch": 0.8270123219715154, "grad_norm": 0.16126485168933868, "kl": 0.21559477150440215, "learning_rate": 1.766500323110445e-06, "loss": 0.051, "reward": 1.8192708969116211, "reward_std": 0.15779685974121094, "rewards/accuracy_reward": 0.10416667088866234, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7442708611488342, "step": 2584 }, { "clip_ratio": 0.0, "completion_length": 574.8729370117187, "epoch": 0.8273323731797088, "grad_norm": 0.28615960478782654, "kl": 0.29707399681210517, "learning_rate": 1.7601626680859073e-06, "loss": 0.1335, "reward": 1.8328125476837158, "reward_std": 0.2527454063296318, "rewards/accuracy_reward": 0.14791667070239783, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7348958611488342, "step": 2585 }, { "clip_ratio": 0.0, "completion_length": 581.9645935058594, "epoch": 0.8276524243879021, "grad_norm": 0.10823047906160355, "kl": 0.22162544690072536, "learning_rate": 1.7538353049871826e-06, "loss": 0.0416, "reward": 1.7625000119209289, "reward_std": 0.140020003169775, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7437500178813934, "step": 2586 }, { "clip_ratio": 0.0, "completion_length": 557.1250183105469, "epoch": 0.8279724755960953, "grad_norm": 0.19290484488010406, "kl": 0.16696857735514642, "learning_rate": 1.7475182417174318e-06, "loss": 0.0861, "reward": 1.794270884990692, "reward_std": 0.22950992360711098, "rewards/accuracy_reward": 0.08333333656191826, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7401041865348816, "step": 2587 }, { "clip_ratio": 0.0, "completion_length": 545.9187683105469, "epoch": 0.8282925268042887, "grad_norm": 0.10784605145454407, "kl": 0.30665692016482354, "learning_rate": 1.7412114861669482e-06, "loss": 0.077, "reward": 1.8307292580604553, "reward_std": 0.21484979316592218, "rewards/accuracy_reward": 0.12708333600312471, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7369791865348816, "step": 2588 }, { "clip_ratio": 0.0, "completion_length": 585.3979431152344, "epoch": 0.828612578012482, "grad_norm": 0.2151651382446289, "kl": 0.41664362922310827, "learning_rate": 1.7349150462131536e-06, "loss": 0.0866, "reward": 1.8447917103767395, "reward_std": 0.29800059348344804, "rewards/accuracy_reward": 0.1562500050291419, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7343750178813935, "step": 2589 }, { "clip_ratio": 0.0, "completion_length": 563.6187622070313, "epoch": 0.8289326292206753, "grad_norm": 0.1969972550868988, "kl": 0.3401130996644497, "learning_rate": 1.7286289297205826e-06, "loss": 0.0593, "reward": 1.7770833611488341, "reward_std": 0.260301998257637, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7395833551883697, "step": 2590 }, { "clip_ratio": 0.0, "completion_length": 578.2979370117188, "epoch": 0.8292526804288686, "grad_norm": 0.27202510833740234, "kl": 0.33993891403079035, "learning_rate": 1.722353144540877e-06, "loss": 0.1264, "reward": 1.8276042222976685, "reward_std": 0.28314041793346406, "rewards/accuracy_reward": 0.15833333935588598, "rewards/format_reward": 0.9354166805744171, "rewards/tag_count_reward": 0.7338541924953461, "step": 2591 }, { "clip_ratio": 0.0, "completion_length": 589.4000305175781, "epoch": 0.8295727316370619, "grad_norm": 0.21395038068294525, "kl": 0.3511948026716709, "learning_rate": 1.716087698512775e-06, "loss": 0.0687, "reward": 1.7541667103767395, "reward_std": 0.21019265428185463, "rewards/accuracy_reward": 0.05416666921228171, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7395833432674408, "step": 2592 }, { "clip_ratio": 0.0, "completion_length": 563.2916839599609, "epoch": 0.8298927828452553, "grad_norm": 0.17267273366451263, "kl": 0.344217037782073, "learning_rate": 1.7098325994620934e-06, "loss": 0.0797, "reward": 1.856770884990692, "reward_std": 0.229797425866127, "rewards/accuracy_reward": 0.15000000447034836, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7401041805744171, "step": 2593 }, { "clip_ratio": 0.0, "completion_length": 586.6041870117188, "epoch": 0.8302128340534486, "grad_norm": 0.24830318987369537, "kl": 0.29451605267822745, "learning_rate": 1.703587855201736e-06, "loss": 0.0944, "reward": 1.7479166984558105, "reward_std": 0.18984657078981398, "rewards/accuracy_reward": 0.039583333395421506, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7437500178813934, "step": 2594 }, { "clip_ratio": 0.0, "completion_length": 580.0833557128906, "epoch": 0.8305328852616418, "grad_norm": 0.10851777344942093, "kl": 0.19102349653840064, "learning_rate": 1.6973534735316666e-06, "loss": 0.0364, "reward": 1.7375000238418579, "reward_std": 0.14052069038152695, "rewards/accuracy_reward": 0.027083333767950534, "rewards/format_reward": 0.9708333432674408, "rewards/tag_count_reward": 0.7395833432674408, "step": 2595 }, { "clip_ratio": 0.0, "completion_length": 597.008349609375, "epoch": 0.8308529364698352, "grad_norm": 0.5246524810791016, "kl": 0.30549296662211417, "learning_rate": 1.6911294622389075e-06, "loss": 0.1013, "reward": 1.7406250357627868, "reward_std": 0.24924185127019882, "rewards/accuracy_reward": 0.06041666977107525, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7343750178813935, "step": 2596 }, { "clip_ratio": 0.0, "completion_length": 605.1958557128906, "epoch": 0.8311729876780285, "grad_norm": 0.10506986081600189, "kl": 0.30327147617936134, "learning_rate": 1.6849158290975277e-06, "loss": 0.0728, "reward": 1.7791667103767395, "reward_std": 0.21810988560318947, "rewards/accuracy_reward": 0.08958333544433117, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7354166805744171, "step": 2597 }, { "clip_ratio": 0.0, "completion_length": 593.0312774658203, "epoch": 0.8314930388862218, "grad_norm": 0.15811342000961304, "kl": 0.36188592612743375, "learning_rate": 1.6787125818686322e-06, "loss": 0.1335, "reward": 1.7531250357627868, "reward_std": 0.3249521702528, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.931250023841858, "rewards/tag_count_reward": 0.7281250178813934, "step": 2598 }, { "clip_ratio": 0.0, "completion_length": 567.2604370117188, "epoch": 0.8318130900944151, "grad_norm": 0.21360376477241516, "kl": 0.3731867164373398, "learning_rate": 1.6725197283003548e-06, "loss": 0.0885, "reward": 1.8067708730697631, "reward_std": 0.22768967226147652, "rewards/accuracy_reward": 0.10625000465661287, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7380208492279052, "step": 2599 }, { "clip_ratio": 0.0, "completion_length": 579.5771026611328, "epoch": 0.8321331413026084, "grad_norm": 0.18168602883815765, "kl": 0.3145070172846317, "learning_rate": 1.6663372761278507e-06, "loss": 0.0948, "reward": 1.7807292222976685, "reward_std": 0.18614666685461997, "rewards/accuracy_reward": 0.07916666977107525, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7390625178813934, "step": 2600 }, { "clip_ratio": 0.0, "completion_length": 558.2708435058594, "epoch": 0.8324531925108017, "grad_norm": 0.34099605679512024, "kl": 0.3224764481186867, "learning_rate": 1.6601652330732732e-06, "loss": 0.1071, "reward": 1.7057291984558105, "reward_std": 0.23296904936432838, "rewards/accuracy_reward": 0.018750000558793545, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7328125178813935, "step": 2601 }, { "clip_ratio": 0.0, "completion_length": 575.7791931152344, "epoch": 0.8327732437189951, "grad_norm": 0.2730673849582672, "kl": 0.5597857162356377, "learning_rate": 1.6540036068457833e-06, "loss": 0.1304, "reward": 1.8041667103767396, "reward_std": 0.24048233777284622, "rewards/accuracy_reward": 0.11666667014360428, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7333333492279053, "step": 2602 }, { "clip_ratio": 0.0, "completion_length": 561.2062774658203, "epoch": 0.8330932949271883, "grad_norm": 0.17329207062721252, "kl": 0.39218656048178674, "learning_rate": 1.647852405141529e-06, "loss": 0.105, "reward": 1.7619791984558106, "reward_std": 0.2654853545129299, "rewards/accuracy_reward": 0.07500000335276127, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7307291805744172, "step": 2603 }, { "clip_ratio": 0.0, "completion_length": 570.6500183105469, "epoch": 0.8334133461353816, "grad_norm": 0.16453422605991364, "kl": 0.42657639682292936, "learning_rate": 1.6417116356436348e-06, "loss": 0.0935, "reward": 1.7958333849906922, "reward_std": 0.2776599481701851, "rewards/accuracy_reward": 0.11041666977107525, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7312500238418579, "step": 2604 }, { "clip_ratio": 0.0, "completion_length": 574.5770935058594, "epoch": 0.833733397343575, "grad_norm": 0.27947208285331726, "kl": 0.570138244330883, "learning_rate": 1.6355813060221993e-06, "loss": 0.0999, "reward": 1.7619792222976685, "reward_std": 0.25141082108020785, "rewards/accuracy_reward": 0.0895833358168602, "rewards/format_reward": 0.9395833671092987, "rewards/tag_count_reward": 0.7328125238418579, "step": 2605 }, { "clip_ratio": 0.0, "completion_length": 605.933349609375, "epoch": 0.8340534485517683, "grad_norm": 0.1307932436466217, "kl": 0.2841352041810751, "learning_rate": 1.6294614239342764e-06, "loss": 0.0737, "reward": 1.7343750476837159, "reward_std": 0.17624344304203987, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7364583492279053, "step": 2606 }, { "clip_ratio": 0.0, "completion_length": 591.839599609375, "epoch": 0.8343734997599616, "grad_norm": 0.17136070132255554, "kl": 0.32739310935139654, "learning_rate": 1.6233519970238732e-06, "loss": 0.0843, "reward": 1.7619792222976685, "reward_std": 0.18518462628126145, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7348958551883698, "step": 2607 }, { "clip_ratio": 0.0, "completion_length": 562.8708557128906, "epoch": 0.8346935509681549, "grad_norm": 0.12200771272182465, "kl": 0.25093438662588596, "learning_rate": 1.6172530329219416e-06, "loss": 0.0834, "reward": 1.8307292222976685, "reward_std": 0.18550884053111077, "rewards/accuracy_reward": 0.12291667200624942, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7411458432674408, "step": 2608 }, { "clip_ratio": 0.0, "completion_length": 587.4666931152344, "epoch": 0.8350136021763482, "grad_norm": 0.18844038248062134, "kl": 0.3024763770401478, "learning_rate": 1.6111645392463548e-06, "loss": 0.071, "reward": 1.7739583611488343, "reward_std": 0.20886053442955016, "rewards/accuracy_reward": 0.06875000298023223, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7385416805744172, "step": 2609 }, { "clip_ratio": 0.0, "completion_length": 589.9791870117188, "epoch": 0.8353336533845416, "grad_norm": 0.21762879192829132, "kl": 0.36289220452308657, "learning_rate": 1.6050865236019165e-06, "loss": 0.0809, "reward": 1.7531250357627868, "reward_std": 0.27787193953990935, "rewards/accuracy_reward": 0.07916666679084301, "rewards/format_reward": 0.9395833492279053, "rewards/tag_count_reward": 0.7343750238418579, "step": 2610 }, { "clip_ratio": 0.0, "completion_length": 564.4062683105469, "epoch": 0.8356537045927348, "grad_norm": 0.2098568081855774, "kl": 0.3839412644505501, "learning_rate": 1.5990189935803402e-06, "loss": 0.1146, "reward": 1.7791666984558105, "reward_std": 0.24917776137590408, "rewards/accuracy_reward": 0.08750000037252903, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7375000178813934, "step": 2611 }, { "clip_ratio": 0.0, "completion_length": 605.5666931152343, "epoch": 0.8359737558009281, "grad_norm": 0.1561695784330368, "kl": 0.38026146665215493, "learning_rate": 1.5929619567602429e-06, "loss": 0.0763, "reward": 1.8197917103767396, "reward_std": 0.21357117891311644, "rewards/accuracy_reward": 0.1416666716337204, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.7343750238418579, "step": 2612 }, { "clip_ratio": 0.0, "completion_length": 564.8062683105469, "epoch": 0.8362938070091215, "grad_norm": 0.13479575514793396, "kl": 0.1574801068753004, "learning_rate": 1.5869154207071347e-06, "loss": 0.0659, "reward": 1.8072916984558105, "reward_std": 0.20274921506643295, "rewards/accuracy_reward": 0.08958333749324084, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.746875011920929, "step": 2613 }, { "clip_ratio": 0.0, "completion_length": 578.0937713623047, "epoch": 0.8366138582173148, "grad_norm": 0.17284269630908966, "kl": 0.2832434602081776, "learning_rate": 1.58087939297341e-06, "loss": 0.0588, "reward": 1.8270833849906922, "reward_std": 0.15429509207606315, "rewards/accuracy_reward": 0.11666667014360428, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7395833492279053, "step": 2614 }, { "clip_ratio": 0.0, "completion_length": 581.4229309082032, "epoch": 0.836933909425508, "grad_norm": 0.09957928210496902, "kl": 0.12662966325879096, "learning_rate": 1.5748538810983382e-06, "loss": 0.0306, "reward": 1.9494792222976685, "reward_std": 0.14919039011001586, "rewards/accuracy_reward": 0.2250000074505806, "rewards/format_reward": 0.9791666865348816, "rewards/tag_count_reward": 0.745312511920929, "step": 2615 }, { "clip_ratio": 0.0, "completion_length": 604.9479309082031, "epoch": 0.8372539606337014, "grad_norm": 0.14037205278873444, "kl": 0.4093516394495964, "learning_rate": 1.5688388926080534e-06, "loss": 0.1201, "reward": 1.823958396911621, "reward_std": 0.24740922898054124, "rewards/accuracy_reward": 0.15208333786576986, "rewards/format_reward": 0.9395833492279053, "rewards/tag_count_reward": 0.7322916805744171, "step": 2616 }, { "clip_ratio": 0.0, "completion_length": 579.7166870117187, "epoch": 0.8375740118418947, "grad_norm": 0.3734375536441803, "kl": 0.2914412751793861, "learning_rate": 1.5628344350155477e-06, "loss": 0.1217, "reward": 1.7296875238418579, "reward_std": 0.23707970902323722, "rewards/accuracy_reward": 0.03958333358168602, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.735937523841858, "step": 2617 }, { "clip_ratio": 0.0, "completion_length": 586.3666900634765, "epoch": 0.8378940630500881, "grad_norm": 0.25126445293426514, "kl": 0.26317610368132593, "learning_rate": 1.5568405158206523e-06, "loss": 0.0849, "reward": 1.7848958730697633, "reward_std": 0.18527232334017754, "rewards/accuracy_reward": 0.08958333637565374, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7411458492279053, "step": 2618 }, { "clip_ratio": 0.0, "completion_length": 575.4645935058594, "epoch": 0.8382141142582813, "grad_norm": 0.3053106367588043, "kl": 0.44118655622005465, "learning_rate": 1.5508571425100428e-06, "loss": 0.0941, "reward": 1.7338541984558105, "reward_std": 0.26469497084617616, "rewards/accuracy_reward": 0.05625000149011612, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.735937523841858, "step": 2619 }, { "clip_ratio": 0.0, "completion_length": 559.1729370117188, "epoch": 0.8385341654664746, "grad_norm": 0.22060352563858032, "kl": 0.2865352720022202, "learning_rate": 1.5448843225572218e-06, "loss": 0.0707, "reward": 1.809375035762787, "reward_std": 0.16750450432300568, "rewards/accuracy_reward": 0.09791667014360428, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7427083432674408, "step": 2620 }, { "clip_ratio": 0.0, "completion_length": 568.0520935058594, "epoch": 0.838854216674668, "grad_norm": 0.33194923400878906, "kl": 0.26609497480094435, "learning_rate": 1.5389220634225077e-06, "loss": 0.057, "reward": 1.834375023841858, "reward_std": 0.25171951204538345, "rewards/accuracy_reward": 0.12708334028720855, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.740625011920929, "step": 2621 }, { "clip_ratio": 0.0, "completion_length": 571.0687774658203, "epoch": 0.8391742678828612, "grad_norm": 0.1432190239429474, "kl": 0.20767300575971603, "learning_rate": 1.5329703725530298e-06, "loss": 0.0666, "reward": 1.840625035762787, "reward_std": 0.20196708291769028, "rewards/accuracy_reward": 0.13750000465661288, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7447916865348816, "step": 2622 }, { "clip_ratio": 0.0, "completion_length": 583.5916870117187, "epoch": 0.8394943190910545, "grad_norm": 0.2663310468196869, "kl": 0.5039402447640896, "learning_rate": 1.5270292573827173e-06, "loss": 0.1465, "reward": 1.7687500596046448, "reward_std": 0.2818905636668205, "rewards/accuracy_reward": 0.09791666939854622, "rewards/format_reward": 0.9354166924953461, "rewards/tag_count_reward": 0.735416692495346, "step": 2623 }, { "clip_ratio": 0.0, "completion_length": 594.5104431152344, "epoch": 0.8398143702992479, "grad_norm": 0.14252537488937378, "kl": 0.2896796494722366, "learning_rate": 1.5210987253322862e-06, "loss": 0.0766, "reward": 1.7791666865348816, "reward_std": 0.19611259773373604, "rewards/accuracy_reward": 0.07500000111758709, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7416666865348815, "step": 2624 }, { "clip_ratio": 0.0, "completion_length": 547.9958465576171, "epoch": 0.8401344215074412, "grad_norm": 0.11551255732774734, "kl": 0.38152455165982246, "learning_rate": 1.5151787838092425e-06, "loss": 0.129, "reward": 1.7812500238418578, "reward_std": 0.21832403987646104, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.9416666746139526, "rewards/tag_count_reward": 0.7395833551883697, "step": 2625 }, { "clip_ratio": 0.0, "completion_length": 584.7604370117188, "epoch": 0.8404544727156344, "grad_norm": 0.22384434938430786, "kl": 0.23071169778704642, "learning_rate": 1.509269440207851e-06, "loss": 0.0816, "reward": 1.7817708849906921, "reward_std": 0.23716015443205835, "rewards/accuracy_reward": 0.08541667014360428, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7421875238418579, "step": 2626 }, { "clip_ratio": 0.0, "completion_length": 598.8812683105468, "epoch": 0.8407745239238278, "grad_norm": 0.1428438276052475, "kl": 0.3534207258373499, "learning_rate": 1.5033707019091503e-06, "loss": 0.0928, "reward": 1.7427083730697632, "reward_std": 0.259302493929863, "rewards/accuracy_reward": 0.06458333488553762, "rewards/format_reward": 0.9416666805744172, "rewards/tag_count_reward": 0.7364583492279053, "step": 2627 }, { "clip_ratio": 0.0, "completion_length": 571.1895965576172, "epoch": 0.8410945751320211, "grad_norm": 0.17479349672794342, "kl": 0.30650137886404993, "learning_rate": 1.4974825762809275e-06, "loss": 0.1127, "reward": 1.81927090883255, "reward_std": 0.22232020273804665, "rewards/accuracy_reward": 0.12083333749324084, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7380208551883698, "step": 2628 }, { "clip_ratio": 0.0, "completion_length": 561.5375274658203, "epoch": 0.8414146263402145, "grad_norm": 0.14325033128261566, "kl": 0.27730308175086976, "learning_rate": 1.4916050706777185e-06, "loss": 0.1261, "reward": 1.8218750596046447, "reward_std": 0.2709692373871803, "rewards/accuracy_reward": 0.13333333637565375, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7364583551883698, "step": 2629 }, { "clip_ratio": 0.0, "completion_length": 587.1521057128906, "epoch": 0.8417346775484077, "grad_norm": 0.20191915333271027, "kl": 0.31308504939079285, "learning_rate": 1.4857381924407833e-06, "loss": 0.0824, "reward": 1.7651041984558105, "reward_std": 0.18237927034497262, "rewards/accuracy_reward": 0.0666666679084301, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7421875178813935, "step": 2630 }, { "clip_ratio": 0.0, "completion_length": 576.4229339599609, "epoch": 0.842054728756601, "grad_norm": 0.30419307947158813, "kl": 0.3622270733118057, "learning_rate": 1.4798819488981232e-06, "loss": 0.1275, "reward": 1.767708384990692, "reward_std": 0.24212229549884795, "rewards/accuracy_reward": 0.07500000335276127, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7447916865348816, "step": 2631 }, { "clip_ratio": 0.0, "completion_length": 610.1583557128906, "epoch": 0.8423747799647944, "grad_norm": 0.13989323377609253, "kl": 0.31397813037037847, "learning_rate": 1.474036347364446e-06, "loss": 0.1038, "reward": 1.6932291865348816, "reward_std": 0.18132396936416625, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.736979192495346, "step": 2632 }, { "clip_ratio": 0.0, "completion_length": 583.1500213623046, "epoch": 0.8426948311729877, "grad_norm": 0.20425206422805786, "kl": 0.49030707627534864, "learning_rate": 1.4682013951411723e-06, "loss": 0.1228, "reward": 1.751562523841858, "reward_std": 0.2534876331686974, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7328125178813935, "step": 2633 }, { "clip_ratio": 0.0, "completion_length": 584.2437622070313, "epoch": 0.8430148823811809, "grad_norm": 0.299966961145401, "kl": 0.4217537730932236, "learning_rate": 1.4623770995164133e-06, "loss": 0.0986, "reward": 1.8588541984558105, "reward_std": 0.25987871587276457, "rewards/accuracy_reward": 0.18125000670552255, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7317708492279053, "step": 2634 }, { "clip_ratio": 0.0, "completion_length": 556.0125335693359, "epoch": 0.8433349335893743, "grad_norm": 0.1416076272726059, "kl": 0.24743956252932547, "learning_rate": 1.4565634677649786e-06, "loss": 0.0984, "reward": 1.778645884990692, "reward_std": 0.2074045091867447, "rewards/accuracy_reward": 0.07708333488553762, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7390625238418579, "step": 2635 }, { "clip_ratio": 0.0, "completion_length": 553.6229309082031, "epoch": 0.8436549847975676, "grad_norm": 0.25936782360076904, "kl": 0.44153971374034884, "learning_rate": 1.4507605071483533e-06, "loss": 0.0864, "reward": 1.8010417342185974, "reward_std": 0.22055020928382874, "rewards/accuracy_reward": 0.10833333637565375, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7406250178813935, "step": 2636 }, { "clip_ratio": 0.0, "completion_length": 558.3500213623047, "epoch": 0.843975036005761, "grad_norm": 0.3049626350402832, "kl": 0.28074306845664976, "learning_rate": 1.4449682249146957e-06, "loss": 0.1032, "reward": 1.8630208730697633, "reward_std": 0.2228449009358883, "rewards/accuracy_reward": 0.1708333395421505, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7359375119209289, "step": 2637 }, { "clip_ratio": 0.0, "completion_length": 580.5000274658203, "epoch": 0.8442950872139542, "grad_norm": 0.49720048904418945, "kl": 0.5407494202256202, "learning_rate": 1.4391866282988266e-06, "loss": 0.0996, "reward": 1.7453125238418579, "reward_std": 0.23839708790183067, "rewards/accuracy_reward": 0.07291666921228171, "rewards/format_reward": 0.9416666805744172, "rewards/tag_count_reward": 0.7307291805744172, "step": 2638 }, { "clip_ratio": 0.0, "completion_length": 560.0021057128906, "epoch": 0.8446151384221475, "grad_norm": 0.15758629143238068, "kl": 0.3089794680476189, "learning_rate": 1.433415724522218e-06, "loss": 0.0945, "reward": 1.8218750357627869, "reward_std": 0.2678198732435703, "rewards/accuracy_reward": 0.1270833369344473, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7364583551883698, "step": 2639 }, { "clip_ratio": 0.0, "completion_length": 570.2416778564453, "epoch": 0.8449351896303409, "grad_norm": 0.20118384063243866, "kl": 0.5578947067260742, "learning_rate": 1.4276555207929864e-06, "loss": 0.1285, "reward": 1.7604166865348816, "reward_std": 0.2577348858118057, "rewards/accuracy_reward": 0.08333333432674409, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.729166692495346, "step": 2640 }, { "clip_ratio": 0.0, "completion_length": 549.5604309082031, "epoch": 0.8452552408385342, "grad_norm": 0.23886068165302277, "kl": 0.4678042992949486, "learning_rate": 1.4219060243058879e-06, "loss": 0.1033, "reward": 1.8104166984558105, "reward_std": 0.20677258148789407, "rewards/accuracy_reward": 0.10416667014360428, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7416666805744171, "step": 2641 }, { "clip_ratio": 0.0, "completion_length": 573.9916870117188, "epoch": 0.8455752920467274, "grad_norm": 0.17162956297397614, "kl": 0.31468153595924375, "learning_rate": 1.4161672422422968e-06, "loss": 0.0841, "reward": 1.7854166865348815, "reward_std": 0.15550397485494613, "rewards/accuracy_reward": 0.07708333544433117, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7416666865348815, "step": 2642 }, { "clip_ratio": 0.0, "completion_length": 542.9687591552735, "epoch": 0.8458953432549208, "grad_norm": 0.37330108880996704, "kl": 0.3181822635233402, "learning_rate": 1.410439181770209e-06, "loss": 0.0937, "reward": 1.880208384990692, "reward_std": 0.23388459980487825, "rewards/accuracy_reward": 0.18541667312383653, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7364583611488342, "step": 2643 }, { "clip_ratio": 0.0, "completion_length": 565.7812591552735, "epoch": 0.8462153944631141, "grad_norm": 0.2929092347621918, "kl": 0.3507498770952225, "learning_rate": 1.4047218500442305e-06, "loss": 0.0923, "reward": 1.7640625357627868, "reward_std": 0.22633379325270653, "rewards/accuracy_reward": 0.07916666977107525, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7369791865348816, "step": 2644 }, { "clip_ratio": 0.0, "completion_length": 554.1791809082031, "epoch": 0.8465354456713075, "grad_norm": 0.19493304193019867, "kl": 0.2631402283906937, "learning_rate": 1.3990152542055647e-06, "loss": 0.0892, "reward": 1.692187535762787, "reward_std": 0.20709386914968492, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.9500000298023223, "rewards/tag_count_reward": 0.7380208551883698, "step": 2645 }, { "clip_ratio": 0.0, "completion_length": 574.1312652587891, "epoch": 0.8468554968795007, "grad_norm": 0.1697675734758377, "kl": 0.3390098616480827, "learning_rate": 1.3933194013820038e-06, "loss": 0.0811, "reward": 1.8270833969116211, "reward_std": 0.21904802471399307, "rewards/accuracy_reward": 0.1354166718199849, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7437500119209289, "step": 2646 }, { "clip_ratio": 0.0, "completion_length": 569.6916870117187, "epoch": 0.847175548087694, "grad_norm": 0.18587954342365265, "kl": 0.34423902481794355, "learning_rate": 1.3876342986879243e-06, "loss": 0.0793, "reward": 1.7244791984558105, "reward_std": 0.23162373006343842, "rewards/accuracy_reward": 0.03541666828095913, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7348958551883698, "step": 2647 }, { "clip_ratio": 0.0, "completion_length": 585.3125244140625, "epoch": 0.8474955992958874, "grad_norm": 0.22119919955730438, "kl": 0.3237848818302155, "learning_rate": 1.3819599532242733e-06, "loss": 0.0992, "reward": 1.7864583611488343, "reward_std": 0.2305359125137329, "rewards/accuracy_reward": 0.09166666883975268, "rewards/format_reward": 0.9562500298023224, "rewards/tag_count_reward": 0.7385416924953461, "step": 2648 }, { "clip_ratio": 0.0, "completion_length": 599.4437683105468, "epoch": 0.8478156505040807, "grad_norm": 0.15753722190856934, "kl": 0.36308416426181794, "learning_rate": 1.3762963720785638e-06, "loss": 0.0961, "reward": 1.8088542103767395, "reward_std": 0.2562564447522163, "rewards/accuracy_reward": 0.10208333674818278, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7421875119209289, "step": 2649 }, { "clip_ratio": 0.0, "completion_length": 563.2937591552734, "epoch": 0.8481357017122739, "grad_norm": 0.2265545278787613, "kl": 0.4071022719144821, "learning_rate": 1.3706435623248627e-06, "loss": 0.1261, "reward": 1.817187535762787, "reward_std": 0.2956122875213623, "rewards/accuracy_reward": 0.1479166718199849, "rewards/format_reward": 0.9354166805744171, "rewards/tag_count_reward": 0.7338541865348815, "step": 2650 }, { "clip_ratio": 0.0, "completion_length": 578.441683959961, "epoch": 0.8484557529204673, "grad_norm": 0.2043626606464386, "kl": 0.3923542708158493, "learning_rate": 1.3650015310237796e-06, "loss": 0.1057, "reward": 1.741145873069763, "reward_std": 0.19232798367738724, "rewards/accuracy_reward": 0.043750002048909664, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7369791865348816, "step": 2651 }, { "clip_ratio": 0.0, "completion_length": 556.8833648681641, "epoch": 0.8487758041286606, "grad_norm": 0.17744043469429016, "kl": 0.2601847030222416, "learning_rate": 1.3593702852224655e-06, "loss": 0.0939, "reward": 1.8307292222976685, "reward_std": 0.18132452741265298, "rewards/accuracy_reward": 0.12083333730697632, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7390625178813934, "step": 2652 }, { "clip_ratio": 0.0, "completion_length": 573.8666809082031, "epoch": 0.8490958553368539, "grad_norm": 0.1844325214624405, "kl": 0.2797363385558128, "learning_rate": 1.3537498319545984e-06, "loss": 0.0685, "reward": 1.8364583730697632, "reward_std": 0.2018692083656788, "rewards/accuracy_reward": 0.13125000279396773, "rewards/format_reward": 0.9666666746139526, "rewards/tag_count_reward": 0.7385416805744172, "step": 2653 }, { "clip_ratio": 0.0, "completion_length": 556.9500152587891, "epoch": 0.8494159065450472, "grad_norm": 0.2170618176460266, "kl": 0.23281632959842682, "learning_rate": 1.3481401782403792e-06, "loss": 0.0911, "reward": 1.7765625357627868, "reward_std": 0.1638067312538624, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7411458492279053, "step": 2654 }, { "clip_ratio": 0.0, "completion_length": 541.6770935058594, "epoch": 0.8497359577532405, "grad_norm": 0.1635342389345169, "kl": 0.46017926633358003, "learning_rate": 1.3425413310865087e-06, "loss": 0.1198, "reward": 1.8062500357627869, "reward_std": 0.22490473836660385, "rewards/accuracy_reward": 0.11250000409781932, "rewards/format_reward": 0.9583333432674408, "rewards/tag_count_reward": 0.7354166805744171, "step": 2655 }, { "clip_ratio": 0.0, "completion_length": 580.7958526611328, "epoch": 0.8500560089614339, "grad_norm": 0.1633990854024887, "kl": 0.3503039345145226, "learning_rate": 1.3369532974862053e-06, "loss": 0.0993, "reward": 1.7932292103767395, "reward_std": 0.2544810831546783, "rewards/accuracy_reward": 0.11041666828095913, "rewards/format_reward": 0.9437500238418579, "rewards/tag_count_reward": 0.7390625238418579, "step": 2656 }, { "clip_ratio": 0.0, "completion_length": 551.1583557128906, "epoch": 0.8503760601696272, "grad_norm": 0.15160250663757324, "kl": 0.20180488303303717, "learning_rate": 1.3313760844191713e-06, "loss": 0.0956, "reward": 1.7343750238418578, "reward_std": 0.20577147156000136, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7364583551883698, "step": 2657 }, { "clip_ratio": 0.0, "completion_length": 594.3541931152344, "epoch": 0.8506961113778204, "grad_norm": 0.12937583029270172, "kl": 0.546181932091713, "learning_rate": 1.325809698851598e-06, "loss": 0.0912, "reward": 1.7593750357627869, "reward_std": 0.25953795313835143, "rewards/accuracy_reward": 0.08125000260770321, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7322916984558105, "step": 2658 }, { "clip_ratio": 0.0, "completion_length": 541.1250152587891, "epoch": 0.8510161625860138, "grad_norm": 0.13249269127845764, "kl": 0.2853389322757721, "learning_rate": 1.3202541477361441e-06, "loss": 0.0726, "reward": 1.8062500715255738, "reward_std": 0.1834932841360569, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7354166805744171, "step": 2659 }, { "clip_ratio": 0.0, "completion_length": 560.6979400634766, "epoch": 0.8513362137942071, "grad_norm": 0.16687384247779846, "kl": 0.3180491279810667, "learning_rate": 1.314709438011945e-06, "loss": 0.1088, "reward": 1.7786458611488343, "reward_std": 0.24222037941217422, "rewards/accuracy_reward": 0.09791667088866234, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.7390625178813934, "step": 2660 }, { "clip_ratio": 0.0, "completion_length": 550.1083557128907, "epoch": 0.8516562650024004, "grad_norm": 0.14712361991405487, "kl": 0.30614238381385805, "learning_rate": 1.3091755766045922e-06, "loss": 0.0958, "reward": 1.9145833611488343, "reward_std": 0.21078643649816514, "rewards/accuracy_reward": 0.21458333935588597, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7354166865348816, "step": 2661 }, { "clip_ratio": 0.0, "completion_length": 535.0333526611328, "epoch": 0.8519763162105937, "grad_norm": 0.11870875954627991, "kl": 0.3040292389690876, "learning_rate": 1.303652570426125e-06, "loss": 0.0715, "reward": 1.7276041865348817, "reward_std": 0.1599747955799103, "rewards/accuracy_reward": 0.0125, "rewards/format_reward": 0.9750000238418579, "rewards/tag_count_reward": 0.7401041865348816, "step": 2662 }, { "clip_ratio": 0.0, "completion_length": 569.4250213623047, "epoch": 0.852296367418787, "grad_norm": 0.3058604896068573, "kl": 0.3141802024096251, "learning_rate": 1.2981404263750264e-06, "loss": 0.078, "reward": 1.7994792103767394, "reward_std": 0.20162120461463928, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7390625178813934, "step": 2663 }, { "clip_ratio": 0.0, "completion_length": 563.1250213623047, "epoch": 0.8526164186269803, "grad_norm": 0.18708136677742004, "kl": 0.359296178817749, "learning_rate": 1.2926391513362102e-06, "loss": 0.1079, "reward": 1.7489583611488342, "reward_std": 0.24096645563840866, "rewards/accuracy_reward": 0.052083334885537624, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7364583492279053, "step": 2664 }, { "clip_ratio": 0.0, "completion_length": 574.0354309082031, "epoch": 0.8529364698351737, "grad_norm": 0.22880977392196655, "kl": 0.4187725305557251, "learning_rate": 1.2871487521810166e-06, "loss": 0.1185, "reward": 1.8052083730697632, "reward_std": 0.28198549449443816, "rewards/accuracy_reward": 0.12708333767950536, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.7364583551883698, "step": 2665 }, { "clip_ratio": 0.0, "completion_length": 564.4416809082031, "epoch": 0.8532565210433669, "grad_norm": 0.1381569802761078, "kl": 0.28381815254688264, "learning_rate": 1.2816692357672012e-06, "loss": 0.0855, "reward": 1.794270884990692, "reward_std": 0.21319209337234496, "rewards/accuracy_reward": 0.10833333861082792, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7359375119209289, "step": 2666 }, { "clip_ratio": 0.0, "completion_length": 575.3875244140625, "epoch": 0.8535765722515603, "grad_norm": 0.15339335799217224, "kl": 0.3478463143110275, "learning_rate": 1.2762006089389212e-06, "loss": 0.0999, "reward": 1.7197917103767395, "reward_std": 0.22356971949338914, "rewards/accuracy_reward": 0.02708333358168602, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7385416865348816, "step": 2667 }, { "clip_ratio": 0.0, "completion_length": 570.0375183105468, "epoch": 0.8538966234597536, "grad_norm": 0.27163901925086975, "kl": 0.42675180844962596, "learning_rate": 1.2707428785267396e-06, "loss": 0.0928, "reward": 1.8291667342185973, "reward_std": 0.20625257939100267, "rewards/accuracy_reward": 0.1416666701436043, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7375000178813934, "step": 2668 }, { "clip_ratio": 0.0, "completion_length": 541.3541870117188, "epoch": 0.8542166746679468, "grad_norm": 0.25037550926208496, "kl": 0.23697092048823834, "learning_rate": 1.2652960513476043e-06, "loss": 0.0653, "reward": 1.7333333492279053, "reward_std": 0.13338673561811448, "rewards/accuracy_reward": 0.01666666679084301, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7416666865348815, "step": 2669 }, { "clip_ratio": 0.0, "completion_length": 565.9979370117187, "epoch": 0.8545367258761402, "grad_norm": 0.23881444334983826, "kl": 0.42256051301956177, "learning_rate": 1.2598601342048477e-06, "loss": 0.1146, "reward": 1.8848958611488342, "reward_std": 0.29029052406549455, "rewards/accuracy_reward": 0.19375000428408384, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7348958551883698, "step": 2670 }, { "clip_ratio": 0.0, "completion_length": 546.243765258789, "epoch": 0.8548567770843335, "grad_norm": 0.09502812474966049, "kl": 0.18763545230031015, "learning_rate": 1.2544351338881721e-06, "loss": 0.0846, "reward": 1.7942708492279054, "reward_std": 0.15935472398996353, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7421875178813935, "step": 2671 }, { "clip_ratio": 0.0, "completion_length": 585.3312622070313, "epoch": 0.8551768282925268, "grad_norm": 0.17005877196788788, "kl": 0.4221621580421925, "learning_rate": 1.2490210571736484e-06, "loss": 0.1006, "reward": 1.7765625357627868, "reward_std": 0.22101174741983415, "rewards/accuracy_reward": 0.08541666697710752, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7348958492279053, "step": 2672 }, { "clip_ratio": 0.0, "completion_length": 577.2625183105469, "epoch": 0.8554968795007201, "grad_norm": 0.241080179810524, "kl": 0.3817593351006508, "learning_rate": 1.2436179108236989e-06, "loss": 0.0917, "reward": 1.775000023841858, "reward_std": 0.2237080730497837, "rewards/accuracy_reward": 0.08750000167638064, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7354166805744171, "step": 2673 }, { "clip_ratio": 0.0, "completion_length": 572.0166870117188, "epoch": 0.8558169307089134, "grad_norm": 0.17033007740974426, "kl": 0.3994890958070755, "learning_rate": 1.2382257015870957e-06, "loss": 0.1237, "reward": 1.746875035762787, "reward_std": 0.24429445043206216, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7322916865348816, "step": 2674 }, { "clip_ratio": 0.0, "completion_length": 547.133349609375, "epoch": 0.8561369819171067, "grad_norm": 0.11920144408941269, "kl": 0.25182824283838273, "learning_rate": 1.2328444361989523e-06, "loss": 0.07, "reward": 1.7604166865348816, "reward_std": 0.18208148702979088, "rewards/accuracy_reward": 0.05416666697710752, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7395833492279053, "step": 2675 }, { "clip_ratio": 0.0, "completion_length": 579.2958465576172, "epoch": 0.8564570331253001, "grad_norm": 0.1800604909658432, "kl": 0.3815006874501705, "learning_rate": 1.227474121380705e-06, "loss": 0.104, "reward": 1.7385417103767395, "reward_std": 0.23486847579479217, "rewards/accuracy_reward": 0.0520833358168602, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7364583492279053, "step": 2676 }, { "clip_ratio": 0.0, "completion_length": 563.7083526611328, "epoch": 0.8567770843334933, "grad_norm": 0.19448384642601013, "kl": 0.38638448938727377, "learning_rate": 1.222114763840121e-06, "loss": 0.1007, "reward": 1.8333333730697632, "reward_std": 0.19629458263516425, "rewards/accuracy_reward": 0.1395833384245634, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.735416692495346, "step": 2677 }, { "clip_ratio": 0.0, "completion_length": 570.2854370117187, "epoch": 0.8570971355416866, "grad_norm": 0.15441475808620453, "kl": 0.32658002004027364, "learning_rate": 1.2167663702712773e-06, "loss": 0.1138, "reward": 1.7317708730697632, "reward_std": 0.18842825740575792, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7380208551883698, "step": 2678 }, { "clip_ratio": 0.0, "completion_length": 574.039599609375, "epoch": 0.85741718674988, "grad_norm": 0.19493429362773895, "kl": 0.5683829590678215, "learning_rate": 1.2114289473545583e-06, "loss": 0.1266, "reward": 1.7197916865348817, "reward_std": 0.2377609834074974, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.9520833611488342, "rewards/tag_count_reward": 0.7281250178813934, "step": 2679 }, { "clip_ratio": 0.0, "completion_length": 551.0000183105469, "epoch": 0.8577372379580733, "grad_norm": 0.13462968170642853, "kl": 0.22133766189217569, "learning_rate": 1.2061025017566374e-06, "loss": 0.051, "reward": 1.7822917103767395, "reward_std": 0.16446125581860543, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.975000011920929, "rewards/tag_count_reward": 0.7447916865348816, "step": 2680 }, { "clip_ratio": 0.0, "completion_length": 593.8562622070312, "epoch": 0.8580572891662666, "grad_norm": 0.2179809808731079, "kl": 0.30674128159880637, "learning_rate": 1.2007870401304922e-06, "loss": 0.0736, "reward": 1.8442708849906921, "reward_std": 0.2419275127351284, "rewards/accuracy_reward": 0.1500000072643161, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7338541865348815, "step": 2681 }, { "clip_ratio": 0.0, "completion_length": 580.5625183105469, "epoch": 0.8583773403744599, "grad_norm": 0.18204998970031738, "kl": 0.29479278065264225, "learning_rate": 1.1954825691153682e-06, "loss": 0.0899, "reward": 1.7348958611488343, "reward_std": 0.15623627305030824, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7432291805744171, "step": 2682 }, { "clip_ratio": 0.0, "completion_length": 566.5187744140625, "epoch": 0.8586973915826532, "grad_norm": 0.18926359713077545, "kl": 0.34724462777376175, "learning_rate": 1.190189095336791e-06, "loss": 0.1008, "reward": 1.8578125596046449, "reward_std": 0.2738295793533325, "rewards/accuracy_reward": 0.1708333373069763, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7328125178813935, "step": 2683 }, { "clip_ratio": 0.0, "completion_length": 569.8229309082031, "epoch": 0.8590174427908466, "grad_norm": 0.09490280598402023, "kl": 0.1771955456584692, "learning_rate": 1.1849066254065412e-06, "loss": 0.052, "reward": 1.7838541984558105, "reward_std": 0.1423393502831459, "rewards/accuracy_reward": 0.06250000130385161, "rewards/format_reward": 0.9791666805744171, "rewards/tag_count_reward": 0.7421875238418579, "step": 2684 }, { "clip_ratio": 0.0, "completion_length": 578.9875122070313, "epoch": 0.8593374939990398, "grad_norm": 0.18486838042736053, "kl": 0.38636996001005175, "learning_rate": 1.1796351659226623e-06, "loss": 0.1157, "reward": 1.8244792222976685, "reward_std": 0.2464168481528759, "rewards/accuracy_reward": 0.13541667107492686, "rewards/format_reward": 0.9541666746139527, "rewards/tag_count_reward": 0.7348958492279053, "step": 2685 }, { "clip_ratio": 0.0, "completion_length": 572.1375183105469, "epoch": 0.8596575452072331, "grad_norm": 0.1398511826992035, "kl": 0.3162212152034044, "learning_rate": 1.1743747234694437e-06, "loss": 0.0875, "reward": 1.8901041865348815, "reward_std": 0.18897883892059325, "rewards/accuracy_reward": 0.18750000298023223, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7401041865348816, "step": 2686 }, { "clip_ratio": 0.0, "completion_length": 570.7896057128906, "epoch": 0.8599775964154265, "grad_norm": 0.23524326086044312, "kl": 0.34184712544083595, "learning_rate": 1.1691253046174144e-06, "loss": 0.0877, "reward": 1.7583333849906921, "reward_std": 0.19846115112304688, "rewards/accuracy_reward": 0.05833333544433117, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7354166865348816, "step": 2687 }, { "clip_ratio": 0.0, "completion_length": 545.3125244140625, "epoch": 0.8602976476236198, "grad_norm": 0.3231591284275055, "kl": 0.20597450919449328, "learning_rate": 1.1638869159233301e-06, "loss": 0.0747, "reward": 1.7786458611488343, "reward_std": 0.1821716882288456, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.9708333432674408, "rewards/tag_count_reward": 0.7432291865348816, "step": 2688 }, { "clip_ratio": 0.0, "completion_length": 531.2187683105469, "epoch": 0.860617698831813, "grad_norm": 0.17155896127223969, "kl": 0.30113087967038155, "learning_rate": 1.1586595639301768e-06, "loss": 0.06, "reward": 1.7901041746139525, "reward_std": 0.2241733819246292, "rewards/accuracy_reward": 0.08541666772216558, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7421875238418579, "step": 2689 }, { "clip_ratio": 0.0, "completion_length": 573.2312713623047, "epoch": 0.8609377500400064, "grad_norm": 0.20491039752960205, "kl": 0.4510466232895851, "learning_rate": 1.1534432551671492e-06, "loss": 0.0927, "reward": 1.7369791865348816, "reward_std": 0.21386837363243102, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7348958432674408, "step": 2690 }, { "clip_ratio": 0.0, "completion_length": 575.8146118164062, "epoch": 0.8612578012481997, "grad_norm": 0.09562389552593231, "kl": 0.27018130123615264, "learning_rate": 1.1482379961496536e-06, "loss": 0.0838, "reward": 1.7692708730697633, "reward_std": 0.20710121542215348, "rewards/accuracy_reward": 0.07291667070239782, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7380208492279052, "step": 2691 }, { "clip_ratio": 0.0, "completion_length": 534.3854400634766, "epoch": 0.8615778524563931, "grad_norm": 0.28722503781318665, "kl": 0.22337874136865138, "learning_rate": 1.143043793379287e-06, "loss": 0.082, "reward": 1.789583396911621, "reward_std": 0.19062369912862778, "rewards/accuracy_reward": 0.08541666883975267, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.743750023841858, "step": 2692 }, { "clip_ratio": 0.0, "completion_length": 566.3979339599609, "epoch": 0.8618979036645863, "grad_norm": 0.17179274559020996, "kl": 0.27019251585006715, "learning_rate": 1.1378606533438442e-06, "loss": 0.073, "reward": 1.7218750238418579, "reward_std": 0.22592300027608872, "rewards/accuracy_reward": 0.0354166679084301, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7322916924953461, "step": 2693 }, { "clip_ratio": 0.0, "completion_length": 555.6229370117187, "epoch": 0.8622179548727796, "grad_norm": 0.16243615746498108, "kl": 0.25600522831082345, "learning_rate": 1.1326885825172973e-06, "loss": 0.0854, "reward": 1.8052083730697632, "reward_std": 0.20433509722352028, "rewards/accuracy_reward": 0.10625000223517418, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7364583492279053, "step": 2694 }, { "clip_ratio": 0.0, "completion_length": 539.5146057128907, "epoch": 0.862538006080973, "grad_norm": 0.2923242449760437, "kl": 0.3979221750050783, "learning_rate": 1.1275275873597957e-06, "loss": 0.1161, "reward": 1.7541667103767395, "reward_std": 0.21354512870311737, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7354166805744171, "step": 2695 }, { "clip_ratio": 0.0, "completion_length": 549.702099609375, "epoch": 0.8628580572891663, "grad_norm": 0.16917048394680023, "kl": 0.3872626259922981, "learning_rate": 1.122377674317653e-06, "loss": 0.115, "reward": 1.7744791865348817, "reward_std": 0.21435921788215637, "rewards/accuracy_reward": 0.08958333618938923, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7307291805744172, "step": 2696 }, { "clip_ratio": 0.0, "completion_length": 557.083349609375, "epoch": 0.8631781084973595, "grad_norm": 0.18929900228977203, "kl": 0.33370514437556265, "learning_rate": 1.1172388498233421e-06, "loss": 0.0709, "reward": 1.7781250119209289, "reward_std": 0.1962820142507553, "rewards/accuracy_reward": 0.07708333488553762, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7385416865348816, "step": 2697 }, { "clip_ratio": 0.0, "completion_length": 564.308349609375, "epoch": 0.8634981597055529, "grad_norm": 0.09958707541227341, "kl": 0.24256644695997237, "learning_rate": 1.1121111202954836e-06, "loss": 0.0797, "reward": 1.7947916984558105, "reward_std": 0.21337411254644395, "rewards/accuracy_reward": 0.0958333345130086, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7385416865348816, "step": 2698 }, { "clip_ratio": 0.0, "completion_length": 550.0083557128906, "epoch": 0.8638182109137462, "grad_norm": 0.21212667226791382, "kl": 0.4391674891114235, "learning_rate": 1.1069944921388442e-06, "loss": 0.0906, "reward": 1.7661458849906921, "reward_std": 0.22532435059547423, "rewards/accuracy_reward": 0.07083333488553763, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7328125178813935, "step": 2699 }, { "clip_ratio": 0.0, "completion_length": 574.220849609375, "epoch": 0.8641382621219396, "grad_norm": 0.2691134214401245, "kl": 0.30899880900979043, "learning_rate": 1.1018889717443182e-06, "loss": 0.0998, "reward": 1.7416666865348815, "reward_std": 0.23400208950042725, "rewards/accuracy_reward": 0.04791666772216559, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7354166805744171, "step": 2700 }, { "clip_ratio": 0.0, "completion_length": 569.5500244140625, "epoch": 0.8644583133301328, "grad_norm": 0.20773187279701233, "kl": 0.4713391542434692, "learning_rate": 1.096794565488929e-06, "loss": 0.1375, "reward": 1.7796875476837157, "reward_std": 0.306690426170826, "rewards/accuracy_reward": 0.10208333861082793, "rewards/format_reward": 0.9479166805744171, "rewards/tag_count_reward": 0.7296875178813934, "step": 2701 }, { "clip_ratio": 0.0, "completion_length": 549.7541870117187, "epoch": 0.8647783645383261, "grad_norm": 0.30383846163749695, "kl": 0.27798029854893685, "learning_rate": 1.0917112797358199e-06, "loss": 0.0984, "reward": 1.8197917222976685, "reward_std": 0.2308654323220253, "rewards/accuracy_reward": 0.12708333637565375, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7364583492279053, "step": 2702 }, { "clip_ratio": 0.0, "completion_length": 562.6500183105469, "epoch": 0.8650984157465195, "grad_norm": 0.12061861157417297, "kl": 0.23864154443144797, "learning_rate": 1.086639120834243e-06, "loss": 0.0941, "reward": 1.7442708730697631, "reward_std": 0.1674013689160347, "rewards/accuracy_reward": 0.03958333432674408, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7380208551883698, "step": 2703 }, { "clip_ratio": 0.0, "completion_length": 583.1708557128907, "epoch": 0.8654184669547128, "grad_norm": 0.21024452149868011, "kl": 0.41636669635772705, "learning_rate": 1.0815780951195521e-06, "loss": 0.1121, "reward": 1.7630208730697632, "reward_std": 0.2516354911029339, "rewards/accuracy_reward": 0.09166666977107525, "rewards/format_reward": 0.9395833551883698, "rewards/tag_count_reward": 0.7317708492279053, "step": 2704 }, { "clip_ratio": 0.0, "completion_length": 567.3541809082031, "epoch": 0.865738518162906, "grad_norm": 0.27876242995262146, "kl": 0.3096645545214415, "learning_rate": 1.076528208913189e-06, "loss": 0.1002, "reward": 1.8083333611488341, "reward_std": 0.20803507119417192, "rewards/accuracy_reward": 0.10625000149011612, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7416666865348815, "step": 2705 }, { "clip_ratio": 0.0, "completion_length": 569.6625122070312, "epoch": 0.8660585693710994, "grad_norm": 0.19789564609527588, "kl": 0.2858868185430765, "learning_rate": 1.0714894685226961e-06, "loss": 0.1107, "reward": 1.7984375476837158, "reward_std": 0.23042803555727004, "rewards/accuracy_reward": 0.10000000260770321, "rewards/format_reward": 0.9583333432674408, "rewards/tag_count_reward": 0.7401041865348816, "step": 2706 }, { "clip_ratio": 0.0, "completion_length": 549.8604309082032, "epoch": 0.8663786205792927, "grad_norm": 0.12375348061323166, "kl": 0.3638369083404541, "learning_rate": 1.0664618802416814e-06, "loss": 0.138, "reward": 1.7260416984558105, "reward_std": 0.24393313825130464, "rewards/accuracy_reward": 0.05000000204890966, "rewards/format_reward": 0.9395833492279053, "rewards/tag_count_reward": 0.7364583551883698, "step": 2707 }, { "clip_ratio": 0.0, "completion_length": 563.0250183105469, "epoch": 0.866698671787486, "grad_norm": 0.18104706704616547, "kl": 0.30177239924669264, "learning_rate": 1.0614454503498306e-06, "loss": 0.0829, "reward": 1.7479166865348816, "reward_std": 0.1778070405125618, "rewards/accuracy_reward": 0.04166666772216558, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7395833551883697, "step": 2708 }, { "clip_ratio": 0.0, "completion_length": 558.0500091552734, "epoch": 0.8670187229956793, "grad_norm": 0.15156742930412292, "kl": 0.36322931200265884, "learning_rate": 1.0564401851128846e-06, "loss": 0.1007, "reward": 1.7916667222976685, "reward_std": 0.22478875443339347, "rewards/accuracy_reward": 0.09583333600312471, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.737500011920929, "step": 2709 }, { "clip_ratio": 0.0, "completion_length": 568.9916809082031, "epoch": 0.8673387742038726, "grad_norm": 0.24718128144741058, "kl": 0.3891347452998161, "learning_rate": 1.0514460907826473e-06, "loss": 0.0986, "reward": 1.8010417103767395, "reward_std": 0.23830287754535676, "rewards/accuracy_reward": 0.10416666902601719, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7385416924953461, "step": 2710 }, { "clip_ratio": 0.0, "completion_length": 560.3625244140625, "epoch": 0.867658825412066, "grad_norm": 0.2284335196018219, "kl": 0.367191506177187, "learning_rate": 1.0464631735969655e-06, "loss": 0.1262, "reward": 1.7213541984558105, "reward_std": 0.26351067125797273, "rewards/accuracy_reward": 0.03750000074505806, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7338541865348815, "step": 2711 }, { "clip_ratio": 0.0, "completion_length": 559.2896057128906, "epoch": 0.8679788766202592, "grad_norm": 0.132751002907753, "kl": 0.20918865650892257, "learning_rate": 1.0414914397797271e-06, "loss": 0.063, "reward": 1.8729167222976684, "reward_std": 0.21915601789951325, "rewards/accuracy_reward": 0.16250000651925803, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7458333551883698, "step": 2712 }, { "clip_ratio": 0.0, "completion_length": 558.395849609375, "epoch": 0.8682989278284525, "grad_norm": 0.12675440311431885, "kl": 0.3191126808524132, "learning_rate": 1.0365308955408459e-06, "loss": 0.1096, "reward": 1.7890625476837159, "reward_std": 0.24897948130965233, "rewards/accuracy_reward": 0.10000000409781933, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7390625178813934, "step": 2713 }, { "clip_ratio": 0.0, "completion_length": 576.3437713623047, "epoch": 0.8686189790366459, "grad_norm": 0.13551534712314606, "kl": 0.24030449092388154, "learning_rate": 1.031581547076268e-06, "loss": 0.098, "reward": 1.746875035762787, "reward_std": 0.16820005923509598, "rewards/accuracy_reward": 0.04791666865348816, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7385416805744172, "step": 2714 }, { "clip_ratio": 0.0, "completion_length": 576.8666809082031, "epoch": 0.8689390302448392, "grad_norm": 0.39579689502716064, "kl": 0.38164796978235244, "learning_rate": 1.0266434005679503e-06, "loss": 0.0777, "reward": 1.7369792103767394, "reward_std": 0.16624386757612228, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7369791805744171, "step": 2715 }, { "clip_ratio": 0.0, "completion_length": 574.0937683105469, "epoch": 0.8692590814530324, "grad_norm": 0.21584556996822357, "kl": 0.36943108662962915, "learning_rate": 1.0217164621838605e-06, "loss": 0.1115, "reward": 1.833333396911621, "reward_std": 0.24086197540163995, "rewards/accuracy_reward": 0.13541667070239782, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7375000238418579, "step": 2716 }, { "clip_ratio": 0.0, "completion_length": 563.5812744140625, "epoch": 0.8695791326612258, "grad_norm": 0.21526369452476501, "kl": 0.30623162984848024, "learning_rate": 1.016800738077962e-06, "loss": 0.0955, "reward": 1.6947916984558105, "reward_std": 0.20032616332173347, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7364583551883698, "step": 2717 }, { "clip_ratio": 0.0, "completion_length": 543.6250244140625, "epoch": 0.8698991838694191, "grad_norm": 0.1324380338191986, "kl": 0.5877346590161323, "learning_rate": 1.011896234390215e-06, "loss": 0.1509, "reward": 1.7822916984558106, "reward_std": 0.2619222469627857, "rewards/accuracy_reward": 0.09791666679084302, "rewards/format_reward": 0.9520833432674408, "rewards/tag_count_reward": 0.7322916805744171, "step": 2718 }, { "clip_ratio": 0.0, "completion_length": 564.3979431152344, "epoch": 0.8702192350776125, "grad_norm": 0.13195985555648804, "kl": 0.18522481769323348, "learning_rate": 1.0070029572465657e-06, "loss": 0.0962, "reward": 1.8500000715255738, "reward_std": 0.18061546236276627, "rewards/accuracy_reward": 0.14375000428408385, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7416666805744171, "step": 2719 }, { "clip_ratio": 0.0, "completion_length": 586.8250183105469, "epoch": 0.8705392862858057, "grad_norm": 0.17610874772071838, "kl": 0.3283210381865501, "learning_rate": 1.002120912758935e-06, "loss": 0.1201, "reward": 1.725520873069763, "reward_std": 0.26378956735134124, "rewards/accuracy_reward": 0.04375000149011612, "rewards/format_reward": 0.9416666924953461, "rewards/tag_count_reward": 0.7401041805744171, "step": 2720 }, { "clip_ratio": 0.0, "completion_length": 546.0896057128906, "epoch": 0.870859337493999, "grad_norm": 0.1175459697842598, "kl": 0.3252341076731682, "learning_rate": 9.97250107025216e-07, "loss": 0.0661, "reward": 1.8026041984558105, "reward_std": 0.1510702796280384, "rewards/accuracy_reward": 0.0895833358168602, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7401041805744171, "step": 2721 }, { "clip_ratio": 0.0, "completion_length": 563.7833587646485, "epoch": 0.8711793887021924, "grad_norm": 0.08211004734039307, "kl": 0.2834597870707512, "learning_rate": 9.923905461292638e-07, "loss": 0.0908, "reward": 1.8187500357627868, "reward_std": 0.1927901290357113, "rewards/accuracy_reward": 0.11041667070239783, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.743750023841858, "step": 2722 }, { "clip_ratio": 0.0, "completion_length": 568.2708557128906, "epoch": 0.8714994399103857, "grad_norm": 0.11005070060491562, "kl": 0.3537232682108879, "learning_rate": 9.87542236140886e-07, "loss": 0.0729, "reward": 1.7796875476837157, "reward_std": 0.18082961216568946, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7380208492279052, "step": 2723 }, { "clip_ratio": 0.0, "completion_length": 560.6145935058594, "epoch": 0.8718194911185789, "grad_norm": 0.13142850995063782, "kl": 0.26408388651907444, "learning_rate": 9.82705183115842e-07, "loss": 0.1009, "reward": 1.7942708611488343, "reward_std": 0.19019991308450698, "rewards/accuracy_reward": 0.08750000149011612, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7380208432674408, "step": 2724 }, { "clip_ratio": 0.0, "completion_length": 600.4479370117188, "epoch": 0.8721395423267723, "grad_norm": 0.2968617379665375, "kl": 0.36875737756490706, "learning_rate": 9.77879393095823e-07, "loss": 0.0958, "reward": 1.8088542342185974, "reward_std": 0.21599897295236586, "rewards/accuracy_reward": 0.10416667275130749, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7421875238418579, "step": 2725 }, { "clip_ratio": 0.0, "completion_length": 565.4604339599609, "epoch": 0.8724595935349656, "grad_norm": 0.09855780750513077, "kl": 0.28418837301433086, "learning_rate": 9.730648721084601e-07, "loss": 0.0916, "reward": 1.7796875357627868, "reward_std": 0.16487232595682144, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7421875178813935, "step": 2726 }, { "clip_ratio": 0.0, "completion_length": 560.2479431152344, "epoch": 0.8727796447431589, "grad_norm": 0.23963011801242828, "kl": 0.3469237022101879, "learning_rate": 9.682616261673039e-07, "loss": 0.1049, "reward": 1.7890625476837159, "reward_std": 0.2078261062502861, "rewards/accuracy_reward": 0.09583333730697632, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7348958611488342, "step": 2727 }, { "clip_ratio": 0.0, "completion_length": 534.1500091552734, "epoch": 0.8730996959513522, "grad_norm": 0.2292696088552475, "kl": 0.2644490167498589, "learning_rate": 9.634696612718242e-07, "loss": 0.0742, "reward": 1.8494792342185975, "reward_std": 0.1921914353966713, "rewards/accuracy_reward": 0.1562500035390258, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7369791805744171, "step": 2728 }, { "clip_ratio": 0.0, "completion_length": 543.3479217529297, "epoch": 0.8734197471595455, "grad_norm": 0.11155485361814499, "kl": 0.35314866527915, "learning_rate": 9.586889834073997e-07, "loss": 0.0853, "reward": 1.7614583611488341, "reward_std": 0.2156276598572731, "rewards/accuracy_reward": 0.058333334513008595, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7385416865348816, "step": 2729 }, { "clip_ratio": 0.0, "completion_length": 547.5625183105469, "epoch": 0.8737397983677389, "grad_norm": 0.27473822236061096, "kl": 0.3876804620027542, "learning_rate": 9.53919598545312e-07, "loss": 0.1188, "reward": 1.7916667103767394, "reward_std": 0.17802366763353347, "rewards/accuracy_reward": 0.08750000204890966, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7437500178813934, "step": 2730 }, { "clip_ratio": 0.0, "completion_length": 550.5500183105469, "epoch": 0.8740598495759322, "grad_norm": 0.10562120378017426, "kl": 0.3450591519474983, "learning_rate": 9.491615126427356e-07, "loss": 0.138, "reward": 1.7677083611488342, "reward_std": 0.23906174153089524, "rewards/accuracy_reward": 0.07916666883975268, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7385416865348816, "step": 2731 }, { "clip_ratio": 0.0, "completion_length": 570.8187744140625, "epoch": 0.8743799007841254, "grad_norm": 0.125253826379776, "kl": 0.3737052485346794, "learning_rate": 9.444147316427332e-07, "loss": 0.0937, "reward": 1.7614583730697633, "reward_std": 0.18856341540813445, "rewards/accuracy_reward": 0.06458333432674408, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7385416865348816, "step": 2732 }, { "clip_ratio": 0.0, "completion_length": 560.6729431152344, "epoch": 0.8746999519923188, "grad_norm": 0.08726673573255539, "kl": 0.24158574528992177, "learning_rate": 9.396792614742478e-07, "loss": 0.0792, "reward": 1.7854166865348815, "reward_std": 0.2222141295671463, "rewards/accuracy_reward": 0.07500000391155481, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7416666865348815, "step": 2733 }, { "clip_ratio": 0.0, "completion_length": 521.0708526611328, "epoch": 0.8750200032005121, "grad_norm": 0.1890716254711151, "kl": 0.49842873513698577, "learning_rate": 9.349551080520913e-07, "loss": 0.1681, "reward": 1.7322917103767395, "reward_std": 0.24929236024618148, "rewards/accuracy_reward": 0.05416666828095913, "rewards/format_reward": 0.9395833551883698, "rewards/tag_count_reward": 0.7385416865348816, "step": 2734 }, { "clip_ratio": 0.0, "completion_length": 554.145849609375, "epoch": 0.8753400544087054, "grad_norm": 0.22258751094341278, "kl": 0.3496193356812, "learning_rate": 9.302422772769437e-07, "loss": 0.1171, "reward": 1.7927083849906922, "reward_std": 0.2532180845737457, "rewards/accuracy_reward": 0.10416666921228171, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7343750238418579, "step": 2735 }, { "clip_ratio": 0.0, "completion_length": 548.139599609375, "epoch": 0.8756601056168987, "grad_norm": 0.17098496854305267, "kl": 0.2673977643251419, "learning_rate": 9.255407750353429e-07, "loss": 0.0588, "reward": 1.7234375476837158, "reward_std": 0.16605336368083953, "rewards/accuracy_reward": 0.012500000186264515, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7421875178813935, "step": 2736 }, { "clip_ratio": 0.0, "completion_length": 564.7958618164063, "epoch": 0.875980156825092, "grad_norm": 0.1342511624097824, "kl": 0.3689066171646118, "learning_rate": 9.208506071996759e-07, "loss": 0.1102, "reward": 1.7838542103767394, "reward_std": 0.21994786560535431, "rewards/accuracy_reward": 0.09791666977107524, "rewards/format_reward": 0.9541666984558106, "rewards/tag_count_reward": 0.7317708551883697, "step": 2737 }, { "clip_ratio": 0.0, "completion_length": 578.3604431152344, "epoch": 0.8763002080332853, "grad_norm": 0.2091393917798996, "kl": 0.3664947893470526, "learning_rate": 9.161717796281677e-07, "loss": 0.1138, "reward": 1.7390625476837158, "reward_std": 0.2935358829796314, "rewards/accuracy_reward": 0.058333336003124715, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7286458492279053, "step": 2738 }, { "clip_ratio": 0.0, "completion_length": 582.3541870117188, "epoch": 0.8766202592414787, "grad_norm": 0.2279767394065857, "kl": 0.3544710837304592, "learning_rate": 9.115042981648903e-07, "loss": 0.0853, "reward": 1.7359375596046447, "reward_std": 0.2095361977815628, "rewards/accuracy_reward": 0.04166666902601719, "rewards/format_reward": 0.950000011920929, "rewards/tag_count_reward": 0.7442708551883698, "step": 2739 }, { "clip_ratio": 0.0, "completion_length": 555.2104370117188, "epoch": 0.8769403104496719, "grad_norm": 0.1344815194606781, "kl": 0.3071120284497738, "learning_rate": 9.068481686397324e-07, "loss": 0.0937, "reward": 1.8223958730697631, "reward_std": 0.19986422806978227, "rewards/accuracy_reward": 0.1270833358168602, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7348958551883698, "step": 2740 }, { "clip_ratio": 0.0, "completion_length": 550.1020904541016, "epoch": 0.8772603616578653, "grad_norm": 0.20531679689884186, "kl": 0.2685227755457163, "learning_rate": 9.022033968684119e-07, "loss": 0.0663, "reward": 1.839062547683716, "reward_std": 0.15720132291316985, "rewards/accuracy_reward": 0.1229166716337204, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7432291805744171, "step": 2741 }, { "clip_ratio": 0.0, "completion_length": 563.4583587646484, "epoch": 0.8775804128660586, "grad_norm": 0.1268453747034073, "kl": 0.3735459715127945, "learning_rate": 8.975699886524536e-07, "loss": 0.0884, "reward": 1.7067708849906922, "reward_std": 0.21321462467312813, "rewards/accuracy_reward": 0.014583333767950535, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7359375119209289, "step": 2742 }, { "clip_ratio": 0.0, "completion_length": 565.3875244140625, "epoch": 0.8779004640742519, "grad_norm": 0.131356343626976, "kl": 0.4861250571906567, "learning_rate": 8.929479497791926e-07, "loss": 0.0604, "reward": 1.7661458730697632, "reward_std": 0.2645136177539825, "rewards/accuracy_reward": 0.08333333637565374, "rewards/format_reward": 0.9458333611488342, "rewards/tag_count_reward": 0.736979192495346, "step": 2743 }, { "clip_ratio": 0.0, "completion_length": 550.4333587646485, "epoch": 0.8782205152824452, "grad_norm": 0.2043062150478363, "kl": 0.2636591024696827, "learning_rate": 8.88337286021762e-07, "loss": 0.09, "reward": 1.7682292103767394, "reward_std": 0.17424845919013024, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7390625238418579, "step": 2744 }, { "clip_ratio": 0.0, "completion_length": 574.2041809082032, "epoch": 0.8785405664906385, "grad_norm": 0.12795977294445038, "kl": 0.17618353292346, "learning_rate": 8.837380031390885e-07, "loss": 0.0719, "reward": 1.8588542342185974, "reward_std": 0.1677723281085491, "rewards/accuracy_reward": 0.1562500052154064, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7421875178813935, "step": 2745 }, { "clip_ratio": 0.0, "completion_length": 570.6916870117187, "epoch": 0.8788606176988318, "grad_norm": 0.08926547318696976, "kl": 0.1727425143122673, "learning_rate": 8.791501068758823e-07, "loss": 0.0632, "reward": 1.7614583611488341, "reward_std": 0.20442461520433425, "rewards/accuracy_reward": 0.05833333488553762, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7427083551883698, "step": 2746 }, { "clip_ratio": 0.0, "completion_length": 576.4208557128907, "epoch": 0.8791806689070252, "grad_norm": 0.24627292156219482, "kl": 0.22680708356201648, "learning_rate": 8.745736029626306e-07, "loss": 0.0524, "reward": 1.726562535762787, "reward_std": 0.12180216982960701, "rewards/accuracy_reward": 0.00416666679084301, "rewards/format_reward": 0.9791666865348816, "rewards/tag_count_reward": 0.7432291865348816, "step": 2747 }, { "clip_ratio": 0.0, "completion_length": 558.7125213623046, "epoch": 0.8795007201152184, "grad_norm": 0.13664305210113525, "kl": 0.23294759541749954, "learning_rate": 8.70008497115592e-07, "loss": 0.0874, "reward": 1.8510417103767396, "reward_std": 0.183480966091156, "rewards/accuracy_reward": 0.13958333879709245, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7447916865348816, "step": 2748 }, { "clip_ratio": 0.0, "completion_length": 558.5187683105469, "epoch": 0.8798207713234117, "grad_norm": 0.2415604591369629, "kl": 0.3387389235198498, "learning_rate": 8.654547950367898e-07, "loss": 0.1045, "reward": 1.7510416984558106, "reward_std": 0.18389342874288558, "rewards/accuracy_reward": 0.0479166679084301, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7447916805744171, "step": 2749 }, { "clip_ratio": 0.0, "completion_length": 574.1229370117187, "epoch": 0.8801408225316051, "grad_norm": 0.165102019906044, "kl": 0.28490790314972403, "learning_rate": 8.609125024139986e-07, "loss": 0.0811, "reward": 1.7822917103767395, "reward_std": 0.20692486464977264, "rewards/accuracy_reward": 0.07708333600312471, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7406250238418579, "step": 2750 }, { "clip_ratio": 0.0, "completion_length": 557.864599609375, "epoch": 0.8804608737397984, "grad_norm": 0.16449257731437683, "kl": 0.31549433767795565, "learning_rate": 8.563816249207457e-07, "loss": 0.0955, "reward": 1.7578125596046448, "reward_std": 0.22044627070426942, "rewards/accuracy_reward": 0.05833333507180214, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7411458432674408, "step": 2751 }, { "clip_ratio": 0.0, "completion_length": 550.3916900634765, "epoch": 0.8807809249479917, "grad_norm": 0.14509373903274536, "kl": 0.26425855085253713, "learning_rate": 8.51862168216303e-07, "loss": 0.0705, "reward": 1.7729166984558105, "reward_std": 0.18468779399991037, "rewards/accuracy_reward": 0.06458333600312471, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7416666746139526, "step": 2752 }, { "clip_ratio": 0.0, "completion_length": 548.145849609375, "epoch": 0.881100976156185, "grad_norm": 0.17047415673732758, "kl": 0.31822266019880774, "learning_rate": 8.473541379456707e-07, "loss": 0.0821, "reward": 1.9010417342185975, "reward_std": 0.22661916613578797, "rewards/accuracy_reward": 0.18541667219251395, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7406250178813935, "step": 2753 }, { "clip_ratio": 0.0, "completion_length": 559.94794921875, "epoch": 0.8814210273643783, "grad_norm": 0.15989422798156738, "kl": 0.30939139164984225, "learning_rate": 8.428575397395833e-07, "loss": 0.0502, "reward": 1.8328125596046447, "reward_std": 0.12547328621149062, "rewards/accuracy_reward": 0.11250000353902578, "rewards/format_reward": 0.9770833551883698, "rewards/tag_count_reward": 0.7432291865348816, "step": 2754 }, { "clip_ratio": 0.0, "completion_length": 538.9437744140625, "epoch": 0.8817410785725716, "grad_norm": 0.3773564398288727, "kl": 0.5229472696781159, "learning_rate": 8.383723792144916e-07, "loss": 0.0804, "reward": 1.723437535762787, "reward_std": 0.18466463536024094, "rewards/accuracy_reward": 0.022916667535901068, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.735937523841858, "step": 2755 }, { "clip_ratio": 0.0, "completion_length": 565.4625244140625, "epoch": 0.8820611297807649, "grad_norm": 0.11909200251102448, "kl": 0.20261020734906196, "learning_rate": 8.338986619725631e-07, "loss": 0.0616, "reward": 1.7557291984558105, "reward_std": 0.19833394810557364, "rewards/accuracy_reward": 0.05208333544433117, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7411458611488342, "step": 2756 }, { "clip_ratio": 0.0, "completion_length": 580.8541809082031, "epoch": 0.8823811809889582, "grad_norm": 0.1677577942609787, "kl": 0.2974459655582905, "learning_rate": 8.294363936016725e-07, "loss": 0.1096, "reward": 1.7640625476837157, "reward_std": 0.21791196018457412, "rewards/accuracy_reward": 0.07916666995733976, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7348958432674408, "step": 2757 }, { "clip_ratio": 0.0, "completion_length": 553.4750183105468, "epoch": 0.8827012321971516, "grad_norm": 0.3134852349758148, "kl": 0.32943947799503803, "learning_rate": 8.249855796753881e-07, "loss": 0.0577, "reward": 1.817708384990692, "reward_std": 0.18252429068088533, "rewards/accuracy_reward": 0.10208333563059568, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7427083492279053, "step": 2758 }, { "clip_ratio": 0.0, "completion_length": 565.122933959961, "epoch": 0.8830212834053448, "grad_norm": 0.21888695657253265, "kl": 0.15557781457901002, "learning_rate": 8.205462257529795e-07, "loss": 0.0677, "reward": 1.7666667222976684, "reward_std": 0.17220090329647064, "rewards/accuracy_reward": 0.052083334513008596, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7416666865348815, "step": 2759 }, { "clip_ratio": 0.0, "completion_length": 566.1833557128906, "epoch": 0.8833413346135381, "grad_norm": 0.15581285953521729, "kl": 0.21956364698708059, "learning_rate": 8.161183373793968e-07, "loss": 0.0811, "reward": 1.7536458730697633, "reward_std": 0.15556090101599693, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.9645833432674408, "rewards/tag_count_reward": 0.7432291746139527, "step": 2760 }, { "clip_ratio": 0.0, "completion_length": 549.208349609375, "epoch": 0.8836613858217315, "grad_norm": 0.30839839577674866, "kl": 0.334910923242569, "learning_rate": 8.117019200852716e-07, "loss": 0.0615, "reward": 1.7645833492279053, "reward_std": 0.1759663164615631, "rewards/accuracy_reward": 0.05000000074505806, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7437500178813934, "step": 2761 }, { "clip_ratio": 0.0, "completion_length": 583.3271118164063, "epoch": 0.8839814370299248, "grad_norm": 0.15235859155654907, "kl": 0.30991976857185366, "learning_rate": 8.07296979386909e-07, "loss": 0.0648, "reward": 1.8390625596046448, "reward_std": 0.1289732813835144, "rewards/accuracy_reward": 0.12291667014360427, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7473958492279053, "step": 2762 }, { "clip_ratio": 0.0, "completion_length": 560.8354339599609, "epoch": 0.884301488238118, "grad_norm": 0.19533921778202057, "kl": 0.2847344473004341, "learning_rate": 8.029035207862712e-07, "loss": 0.1147, "reward": 1.7619791984558106, "reward_std": 0.2641393207013607, "rewards/accuracy_reward": 0.07500000074505805, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7328125178813935, "step": 2763 }, { "clip_ratio": 0.0, "completion_length": 543.4291931152344, "epoch": 0.8846215394463114, "grad_norm": 0.12322978675365448, "kl": 0.21314894780516624, "learning_rate": 7.985215497709909e-07, "loss": 0.0805, "reward": 1.8369792103767395, "reward_std": 0.19562975615262984, "rewards/accuracy_reward": 0.12708333600312471, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.7411458551883697, "step": 2764 }, { "clip_ratio": 0.0, "completion_length": 552.8562744140625, "epoch": 0.8849415906545047, "grad_norm": 0.08437777310609818, "kl": 0.26649289689958094, "learning_rate": 7.94151071814343e-07, "loss": 0.0789, "reward": 1.7765625357627868, "reward_std": 0.18437634333968161, "rewards/accuracy_reward": 0.07291667070239782, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7390625178813934, "step": 2765 }, { "clip_ratio": 0.0, "completion_length": 558.8729431152344, "epoch": 0.8852616418626981, "grad_norm": 0.2084723263978958, "kl": 0.29305521994829176, "learning_rate": 7.897920923752533e-07, "loss": 0.0981, "reward": 1.764062523841858, "reward_std": 0.2217009961605072, "rewards/accuracy_reward": 0.06458333544433117, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7411458492279053, "step": 2766 }, { "clip_ratio": 0.0, "completion_length": 519.0062622070312, "epoch": 0.8855816930708913, "grad_norm": 0.12448950111865997, "kl": 0.28461782485246656, "learning_rate": 7.854446168982777e-07, "loss": 0.0964, "reward": 1.8291667103767395, "reward_std": 0.2683658979833126, "rewards/accuracy_reward": 0.13125000502914191, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7395833611488343, "step": 2767 }, { "clip_ratio": 0.0, "completion_length": 558.808349609375, "epoch": 0.8859017442790846, "grad_norm": 0.11394886672496796, "kl": 0.2418998047709465, "learning_rate": 7.811086508136112e-07, "loss": 0.0943, "reward": 1.7395833611488343, "reward_std": 0.16707825362682344, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7395833551883697, "step": 2768 }, { "clip_ratio": 0.0, "completion_length": 550.5979400634766, "epoch": 0.886221795487278, "grad_norm": 0.2881830930709839, "kl": 0.3590947136282921, "learning_rate": 7.767841995370673e-07, "loss": 0.1011, "reward": 1.8229166865348816, "reward_std": 0.21268870532512665, "rewards/accuracy_reward": 0.12291666865348816, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7375000178813934, "step": 2769 }, { "clip_ratio": 0.0, "completion_length": 540.252099609375, "epoch": 0.8865418466954713, "grad_norm": 0.16620886325836182, "kl": 0.18743032775819302, "learning_rate": 7.724712684700819e-07, "loss": 0.0947, "reward": 1.7177083849906922, "reward_std": 0.17939387410879135, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7406250238418579, "step": 2770 }, { "clip_ratio": 0.0, "completion_length": 563.0541809082031, "epoch": 0.8868618979036645, "grad_norm": 0.1098373681306839, "kl": 0.1804877772927284, "learning_rate": 7.681698629996959e-07, "loss": 0.0475, "reward": 1.8322917342185974, "reward_std": 0.10751088634133339, "rewards/accuracy_reward": 0.10208333656191826, "rewards/format_reward": 0.9833333492279053, "rewards/tag_count_reward": 0.7468750178813934, "step": 2771 }, { "clip_ratio": 0.0, "completion_length": 542.4771026611328, "epoch": 0.8871819491118579, "grad_norm": 0.337385356426239, "kl": 0.18855189830064772, "learning_rate": 7.638799884985593e-07, "loss": 0.0544, "reward": 1.7598958492279053, "reward_std": 0.16964001804590226, "rewards/accuracy_reward": 0.04375000055879354, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.745312511920929, "step": 2772 }, { "clip_ratio": 0.0, "completion_length": 546.3062622070313, "epoch": 0.8875020003200512, "grad_norm": 0.16040952503681183, "kl": 0.37417167574167254, "learning_rate": 7.59601650324917e-07, "loss": 0.0889, "reward": 1.7911458611488342, "reward_std": 0.1644122764468193, "rewards/accuracy_reward": 0.08125000149011612, "rewards/format_reward": 0.9666666746139526, "rewards/tag_count_reward": 0.743229192495346, "step": 2773 }, { "clip_ratio": 0.0, "completion_length": 552.3562683105469, "epoch": 0.8878220515282446, "grad_norm": 0.182782843708992, "kl": 0.19795044660568237, "learning_rate": 7.553348538226079e-07, "loss": 0.074, "reward": 1.778125035762787, "reward_std": 0.1225196048617363, "rewards/accuracy_reward": 0.06875000204890966, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.744791692495346, "step": 2774 }, { "clip_ratio": 0.0, "completion_length": 568.7416870117188, "epoch": 0.8881421027364378, "grad_norm": 0.3356267511844635, "kl": 0.27316139116883276, "learning_rate": 7.510796043210477e-07, "loss": 0.104, "reward": 1.809895884990692, "reward_std": 0.15356628447771073, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7432291865348816, "step": 2775 }, { "clip_ratio": 0.0, "completion_length": 528.4791870117188, "epoch": 0.8884621539446311, "grad_norm": 0.11805539578199387, "kl": 0.18329550586640836, "learning_rate": 7.468359071352338e-07, "loss": 0.0604, "reward": 1.8666667222976685, "reward_std": 0.20947734415531158, "rewards/accuracy_reward": 0.15208333637565374, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7395833551883697, "step": 2776 }, { "clip_ratio": 0.0, "completion_length": 558.1708587646484, "epoch": 0.8887822051528245, "grad_norm": 0.08612954616546631, "kl": 0.15863345116376876, "learning_rate": 7.426037675657361e-07, "loss": 0.0447, "reward": 1.8442708730697632, "reward_std": 0.164375888556242, "rewards/accuracy_reward": 0.1208333345130086, "rewards/format_reward": 0.9791666865348816, "rewards/tag_count_reward": 0.7442708432674408, "step": 2777 }, { "clip_ratio": 0.0, "completion_length": 529.7875030517578, "epoch": 0.8891022563610178, "grad_norm": 0.10837756097316742, "kl": 0.21986165940761565, "learning_rate": 7.383831908986849e-07, "loss": 0.054, "reward": 1.8770833969116212, "reward_std": 0.2275755934417248, "rewards/accuracy_reward": 0.15625000596046448, "rewards/format_reward": 0.9750000238418579, "rewards/tag_count_reward": 0.7458333551883698, "step": 2778 }, { "clip_ratio": 0.0, "completion_length": 556.6583526611328, "epoch": 0.889422307569211, "grad_norm": 0.1390489637851715, "kl": 0.1624234464019537, "learning_rate": 7.341741824057713e-07, "loss": 0.0748, "reward": 1.863020896911621, "reward_std": 0.17711979597806932, "rewards/accuracy_reward": 0.15000000335276126, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7401041805744171, "step": 2779 }, { "clip_ratio": 0.0, "completion_length": 543.8541809082031, "epoch": 0.8897423587774044, "grad_norm": 0.13301320374011993, "kl": 0.20487010031938552, "learning_rate": 7.299767473442332e-07, "loss": 0.0712, "reward": 1.806770884990692, "reward_std": 0.17180782556533813, "rewards/accuracy_reward": 0.09791666865348816, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7421875238418579, "step": 2780 }, { "clip_ratio": 0.0, "completion_length": 586.8833557128906, "epoch": 0.8900624099855977, "grad_norm": 0.10869420319795609, "kl": 0.18663627617061138, "learning_rate": 7.257908909568567e-07, "loss": 0.077, "reward": 1.7598958730697631, "reward_std": 0.1811968594789505, "rewards/accuracy_reward": 0.04583333395421505, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7473958492279053, "step": 2781 }, { "clip_ratio": 0.0, "completion_length": 574.9416870117187, "epoch": 0.890382461193791, "grad_norm": 0.1881677657365799, "kl": 0.3977797865867615, "learning_rate": 7.216166184719653e-07, "loss": 0.1338, "reward": 1.6932291865348816, "reward_std": 0.2559267617762089, "rewards/accuracy_reward": 0.01041666679084301, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7328125238418579, "step": 2782 }, { "clip_ratio": 0.0, "completion_length": 545.9166870117188, "epoch": 0.8907025124019843, "grad_norm": 0.14625728130340576, "kl": 0.1800611212849617, "learning_rate": 7.174539351034071e-07, "loss": 0.0586, "reward": 1.8166667342185974, "reward_std": 0.17907868325710297, "rewards/accuracy_reward": 0.10000000353902579, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7416666805744171, "step": 2783 }, { "clip_ratio": 0.0, "completion_length": 571.2916870117188, "epoch": 0.8910225636101776, "grad_norm": 0.324567049741745, "kl": 0.268314453586936, "learning_rate": 7.133028460505642e-07, "loss": 0.1042, "reward": 1.7411458849906922, "reward_std": 0.22085545733571052, "rewards/accuracy_reward": 0.04791666734963655, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7369791805744171, "step": 2784 }, { "clip_ratio": 0.0, "completion_length": 561.4229309082032, "epoch": 0.891342614818371, "grad_norm": 0.23639392852783203, "kl": 0.4426693290472031, "learning_rate": 7.091633564983314e-07, "loss": 0.129, "reward": 1.8010417103767395, "reward_std": 0.3021409660577774, "rewards/accuracy_reward": 0.11458333656191826, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7364583492279053, "step": 2785 }, { "clip_ratio": 0.0, "completion_length": 566.8229370117188, "epoch": 0.8916626660265643, "grad_norm": 0.13480092585086823, "kl": 0.34565304294228555, "learning_rate": 7.05035471617117e-07, "loss": 0.1022, "reward": 1.7161458730697632, "reward_std": 0.20841763466596602, "rewards/accuracy_reward": 0.02500000149011612, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7390625178813934, "step": 2786 }, { "clip_ratio": 0.0, "completion_length": 563.7416809082031, "epoch": 0.8919827172347575, "grad_norm": 0.23543386161327362, "kl": 0.29164321571588514, "learning_rate": 7.009191965628348e-07, "loss": 0.0929, "reward": 1.7140625238418579, "reward_std": 0.22377543151378632, "rewards/accuracy_reward": 0.02291666679084301, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7432291865348816, "step": 2787 }, { "clip_ratio": 0.0, "completion_length": 546.6020965576172, "epoch": 0.8923027684429509, "grad_norm": 0.08022233843803406, "kl": 0.18929398953914642, "learning_rate": 6.96814536476893e-07, "loss": 0.0559, "reward": 1.7442708611488342, "reward_std": 0.15617139339447023, "rewards/accuracy_reward": 0.0291666679084301, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7442708432674408, "step": 2788 }, { "clip_ratio": 0.0, "completion_length": 552.7229431152343, "epoch": 0.8926228196511442, "grad_norm": 0.12308970093727112, "kl": 0.3831124782562256, "learning_rate": 6.927214964861995e-07, "loss": 0.0681, "reward": 1.9031250596046447, "reward_std": 0.15697635114192962, "rewards/accuracy_reward": 0.19375000521540642, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7427083611488342, "step": 2789 }, { "clip_ratio": 0.0, "completion_length": 578.0833557128906, "epoch": 0.8929428708593375, "grad_norm": 0.16935700178146362, "kl": 0.3776381004601717, "learning_rate": 6.886400817031435e-07, "loss": 0.1194, "reward": 1.7723958969116211, "reward_std": 0.21539193615317345, "rewards/accuracy_reward": 0.08125000149011612, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.739062511920929, "step": 2790 }, { "clip_ratio": 0.0, "completion_length": 570.7687683105469, "epoch": 0.8932629220675308, "grad_norm": 0.08544913679361343, "kl": 0.24526721499860288, "learning_rate": 6.845702972255974e-07, "loss": 0.0746, "reward": 1.7822917222976684, "reward_std": 0.18180104196071625, "rewards/accuracy_reward": 0.08125000316649675, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7427083492279053, "step": 2791 }, { "clip_ratio": 0.0, "completion_length": 528.1562774658203, "epoch": 0.8935829732757241, "grad_norm": 0.16124188899993896, "kl": 0.29785202592611315, "learning_rate": 6.805121481368993e-07, "loss": 0.087, "reward": 1.8942708730697633, "reward_std": 0.24526706635951995, "rewards/accuracy_reward": 0.18125000819563866, "rewards/format_reward": 0.9729166924953461, "rewards/tag_count_reward": 0.7401041924953461, "step": 2792 }, { "clip_ratio": 0.0, "completion_length": 553.9500183105469, "epoch": 0.8939030244839175, "grad_norm": 0.3463839590549469, "kl": 0.2856299549341202, "learning_rate": 6.764656395058622e-07, "loss": 0.0934, "reward": 1.7390625119209289, "reward_std": 0.21077930554747581, "rewards/accuracy_reward": 0.037500000558793545, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7432291865348816, "step": 2793 }, { "clip_ratio": 0.0, "completion_length": 568.3375122070313, "epoch": 0.8942230756921108, "grad_norm": 0.13446158170700073, "kl": 0.31556113585829737, "learning_rate": 6.724307763867555e-07, "loss": 0.0784, "reward": 1.7307292103767395, "reward_std": 0.19142997413873672, "rewards/accuracy_reward": 0.02708333395421505, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7411458551883697, "step": 2794 }, { "clip_ratio": 0.0, "completion_length": 569.5104400634766, "epoch": 0.894543126900304, "grad_norm": 0.33578991889953613, "kl": 0.3360072895884514, "learning_rate": 6.684075638193066e-07, "loss": 0.1057, "reward": 1.7260416984558105, "reward_std": 0.24067014306783677, "rewards/accuracy_reward": 0.04375000223517418, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7343750238418579, "step": 2795 }, { "clip_ratio": 0.0, "completion_length": 557.8666839599609, "epoch": 0.8948631781084974, "grad_norm": 0.219113290309906, "kl": 0.25697992742061615, "learning_rate": 6.643960068286814e-07, "loss": 0.1114, "reward": 1.7854167103767395, "reward_std": 0.19451157450675965, "rewards/accuracy_reward": 0.08750000111758709, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7395833551883697, "step": 2796 }, { "clip_ratio": 0.0, "completion_length": 572.0541809082031, "epoch": 0.8951832293166907, "grad_norm": 0.1934441775083542, "kl": 0.5528342947363853, "learning_rate": 6.603961104255018e-07, "loss": 0.1239, "reward": 1.6994791984558106, "reward_std": 0.3112195998430252, "rewards/accuracy_reward": 0.04166666753590107, "rewards/format_reward": 0.9270833432674408, "rewards/tag_count_reward": 0.7307291805744172, "step": 2797 }, { "clip_ratio": 0.0, "completion_length": 547.883349609375, "epoch": 0.8955032805248839, "grad_norm": 0.07793132960796356, "kl": 0.2168126493692398, "learning_rate": 6.564078796058137e-07, "loss": 0.0569, "reward": 1.726562511920929, "reward_std": 0.1572144016623497, "rewards/accuracy_reward": 0.014583333395421505, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7411458492279053, "step": 2798 }, { "clip_ratio": 0.0, "completion_length": 541.3062713623046, "epoch": 0.8958233317330773, "grad_norm": 0.12241532653570175, "kl": 0.33055841401219366, "learning_rate": 6.52431319351099e-07, "loss": 0.0506, "reward": 1.7161458611488343, "reward_std": 0.16185689046978952, "rewards/accuracy_reward": 0.00833333358168602, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7411458551883697, "step": 2799 }, { "clip_ratio": 0.0, "completion_length": 553.6333465576172, "epoch": 0.8961433829412706, "grad_norm": 0.18630848824977875, "kl": 0.3077032431960106, "learning_rate": 6.484664346282555e-07, "loss": 0.1194, "reward": 1.7812500476837159, "reward_std": 0.2138199493288994, "rewards/accuracy_reward": 0.09166667070239783, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7354166805744171, "step": 2800 }, { "clip_ratio": 0.0, "completion_length": 571.1375213623047, "epoch": 0.896463434149464, "grad_norm": 0.13901875913143158, "kl": 0.32606543600559235, "learning_rate": 6.44513230389604e-07, "loss": 0.0603, "reward": 1.793750035762787, "reward_std": 0.18691025376319886, "rewards/accuracy_reward": 0.0916666692122817, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7395833551883697, "step": 2801 }, { "clip_ratio": 0.0, "completion_length": 603.393765258789, "epoch": 0.8967834853576572, "grad_norm": 0.22432410717010498, "kl": 0.47303307950496676, "learning_rate": 6.405717115728727e-07, "loss": 0.1022, "reward": 1.7541667103767395, "reward_std": 0.2878709942102432, "rewards/accuracy_reward": 0.08541666846722365, "rewards/format_reward": 0.9354166924953461, "rewards/tag_count_reward": 0.7333333551883697, "step": 2802 }, { "clip_ratio": 0.0, "completion_length": 570.6562744140625, "epoch": 0.8971035365658505, "grad_norm": 0.424375057220459, "kl": 0.3386325158178806, "learning_rate": 6.366418831011955e-07, "loss": 0.0917, "reward": 1.7348958730697632, "reward_std": 0.22167600244283675, "rewards/accuracy_reward": 0.04791666828095913, "rewards/format_reward": 0.9437500238418579, "rewards/tag_count_reward": 0.743229192495346, "step": 2803 }, { "clip_ratio": 0.0, "completion_length": 568.420849609375, "epoch": 0.8974235877740439, "grad_norm": 0.20574527978897095, "kl": 0.27687034383416176, "learning_rate": 6.32723749883104e-07, "loss": 0.0859, "reward": 1.7479166984558105, "reward_std": 0.23028158992528916, "rewards/accuracy_reward": 0.05416666753590107, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7375000238418579, "step": 2804 }, { "clip_ratio": 0.0, "completion_length": 543.4354370117187, "epoch": 0.8977436389822372, "grad_norm": 0.13979966938495636, "kl": 0.29248685389757156, "learning_rate": 6.288173168125234e-07, "loss": 0.0998, "reward": 1.7921875357627868, "reward_std": 0.22479058653116227, "rewards/accuracy_reward": 0.09375000242143869, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7463541865348816, "step": 2805 }, { "clip_ratio": 0.0, "completion_length": 566.677099609375, "epoch": 0.8980636901904304, "grad_norm": 0.3141164481639862, "kl": 0.4433578472584486, "learning_rate": 6.249225887687615e-07, "loss": 0.1531, "reward": 1.7578125476837159, "reward_std": 0.29427343755960467, "rewards/accuracy_reward": 0.08333333507180214, "rewards/format_reward": 0.9375000238418579, "rewards/tag_count_reward": 0.7369791865348816, "step": 2806 }, { "clip_ratio": 0.0, "completion_length": 544.2291809082031, "epoch": 0.8983837413986238, "grad_norm": 0.12283515930175781, "kl": 0.2124796152114868, "learning_rate": 6.210395706165106e-07, "loss": 0.0779, "reward": 1.8734375476837157, "reward_std": 0.18899996876716613, "rewards/accuracy_reward": 0.15833333525806664, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7442708492279053, "step": 2807 }, { "clip_ratio": 0.0, "completion_length": 568.6562683105469, "epoch": 0.8987037926068171, "grad_norm": 0.2164728194475174, "kl": 0.21420424431562424, "learning_rate": 6.171682672058322e-07, "loss": 0.075, "reward": 1.7364583611488342, "reward_std": 0.18904096335172654, "rewards/accuracy_reward": 0.031250000558793546, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7427083611488342, "step": 2808 }, { "clip_ratio": 0.0, "completion_length": 566.497933959961, "epoch": 0.8990238438150104, "grad_norm": 0.19198265671730042, "kl": 0.29132072255015373, "learning_rate": 6.133086833721569e-07, "loss": 0.0874, "reward": 1.814583384990692, "reward_std": 0.2104831539094448, "rewards/accuracy_reward": 0.11250000298023224, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7395833611488343, "step": 2809 }, { "clip_ratio": 0.0, "completion_length": 546.2562591552735, "epoch": 0.8993438950232037, "grad_norm": 0.15204951167106628, "kl": 0.2960913643240929, "learning_rate": 6.094608239362799e-07, "loss": 0.1252, "reward": 1.8786458849906922, "reward_std": 0.2817792721092701, "rewards/accuracy_reward": 0.1875000072643161, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7369791805744171, "step": 2810 }, { "clip_ratio": 0.0, "completion_length": 565.775015258789, "epoch": 0.899663946231397, "grad_norm": 0.19675280153751373, "kl": 0.2733559250831604, "learning_rate": 6.056246937043475e-07, "loss": 0.1022, "reward": 1.7692708492279052, "reward_std": 0.16505006551742554, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7380208611488343, "step": 2811 }, { "clip_ratio": 0.0, "completion_length": 573.0083465576172, "epoch": 0.8999839974395903, "grad_norm": 0.29460424184799194, "kl": 0.3481321565806866, "learning_rate": 6.018002974678616e-07, "loss": 0.0974, "reward": 1.8307292342185975, "reward_std": 0.22881743013858796, "rewards/accuracy_reward": 0.1395833384245634, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7369791805744171, "step": 2812 }, { "clip_ratio": 0.0, "completion_length": 540.6479370117188, "epoch": 0.9003040486477837, "grad_norm": 0.12847913801670074, "kl": 0.3264305554330349, "learning_rate": 5.979876400036599e-07, "loss": 0.1143, "reward": 1.7645833611488342, "reward_std": 0.2144318014383316, "rewards/accuracy_reward": 0.0666666692122817, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7395833432674408, "step": 2813 }, { "clip_ratio": 0.0, "completion_length": 562.5354370117187, "epoch": 0.9006240998559769, "grad_norm": 0.2630733549594879, "kl": 0.428336625546217, "learning_rate": 5.941867260739265e-07, "loss": 0.091, "reward": 1.7848958730697633, "reward_std": 0.1891431801021099, "rewards/accuracy_reward": 0.08541666977107525, "rewards/format_reward": 0.9583333432674408, "rewards/tag_count_reward": 0.7411458432674408, "step": 2814 }, { "clip_ratio": 0.0, "completion_length": 545.5000152587891, "epoch": 0.9009441510641703, "grad_norm": 0.12521198391914368, "kl": 0.32432365864515306, "learning_rate": 5.903975604261725e-07, "loss": 0.0986, "reward": 1.8453125596046447, "reward_std": 0.20809532403945924, "rewards/accuracy_reward": 0.14166667237877845, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7411458551883697, "step": 2815 }, { "clip_ratio": 0.0, "completion_length": 582.6250183105469, "epoch": 0.9012642022723636, "grad_norm": 0.1308985948562622, "kl": 0.335451889783144, "learning_rate": 5.866201477932321e-07, "loss": 0.0883, "reward": 1.7958333611488342, "reward_std": 0.2012617200613022, "rewards/accuracy_reward": 0.10000000260770321, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7395833492279053, "step": 2816 }, { "clip_ratio": 0.0, "completion_length": 535.7479370117187, "epoch": 0.9015842534805569, "grad_norm": 0.09408943355083466, "kl": 0.24329584948718547, "learning_rate": 5.828544928932655e-07, "loss": 0.0881, "reward": 1.8208333849906921, "reward_std": 0.22802594751119615, "rewards/accuracy_reward": 0.11041667051613331, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7437500178813934, "step": 2817 }, { "clip_ratio": 0.0, "completion_length": 557.4146057128906, "epoch": 0.9019043046887502, "grad_norm": 0.2508453130722046, "kl": 0.42456541061401365, "learning_rate": 5.791006004297451e-07, "loss": 0.1234, "reward": 1.7947917342185975, "reward_std": 0.21245116442441941, "rewards/accuracy_reward": 0.10000000316649675, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7385416984558105, "step": 2818 }, { "clip_ratio": 0.0, "completion_length": 564.8187683105468, "epoch": 0.9022243558969435, "grad_norm": 0.19571591913700104, "kl": 0.3216987043619156, "learning_rate": 5.753584750914476e-07, "loss": 0.128, "reward": 1.8833333611488343, "reward_std": 0.2775250434875488, "rewards/accuracy_reward": 0.193750006146729, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7354166805744171, "step": 2819 }, { "clip_ratio": 0.0, "completion_length": 555.6791748046875, "epoch": 0.9025444071051368, "grad_norm": 0.23652778565883636, "kl": 0.20177725926041604, "learning_rate": 5.7162812155246e-07, "loss": 0.0781, "reward": 1.751562523841858, "reward_std": 0.21785627081990241, "rewards/accuracy_reward": 0.05208333414047957, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7432291865348816, "step": 2820 }, { "clip_ratio": 0.0, "completion_length": 526.1937744140625, "epoch": 0.9028644583133302, "grad_norm": 0.16797806322574615, "kl": 0.18671303614974022, "learning_rate": 5.679095444721538e-07, "loss": 0.0619, "reward": 1.8432292103767396, "reward_std": 0.1968873217701912, "rewards/accuracy_reward": 0.13333333600312472, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7432291805744171, "step": 2821 }, { "clip_ratio": 0.0, "completion_length": 571.0875183105469, "epoch": 0.9031845095215234, "grad_norm": 0.1344435214996338, "kl": 0.252733601629734, "learning_rate": 5.64202748495204e-07, "loss": 0.1045, "reward": 1.7807292222976685, "reward_std": 0.23075918704271317, "rewards/accuracy_reward": 0.08125000204890967, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7411458611488342, "step": 2822 }, { "clip_ratio": 0.0, "completion_length": 559.1604431152343, "epoch": 0.9035045607297167, "grad_norm": 0.21127544343471527, "kl": 0.2684433352202177, "learning_rate": 5.605077382515644e-07, "loss": 0.0994, "reward": 1.7687500357627868, "reward_std": 0.1979643739759922, "rewards/accuracy_reward": 0.07291666939854621, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.737500011920929, "step": 2823 }, { "clip_ratio": 0.0, "completion_length": 550.8312683105469, "epoch": 0.9038246119379101, "grad_norm": 0.14521047472953796, "kl": 0.23440488129854203, "learning_rate": 5.568245183564669e-07, "loss": 0.0719, "reward": 1.7739583849906921, "reward_std": 0.20358410999178886, "rewards/accuracy_reward": 0.07083333358168602, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7427083492279053, "step": 2824 }, { "clip_ratio": 0.0, "completion_length": 562.9291839599609, "epoch": 0.9041446631461034, "grad_norm": 0.20504160225391388, "kl": 0.301665635406971, "learning_rate": 5.531530934104179e-07, "loss": 0.0843, "reward": 1.728645884990692, "reward_std": 0.21848196685314178, "rewards/accuracy_reward": 0.02916666679084301, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.739062511920929, "step": 2825 }, { "clip_ratio": 0.0, "completion_length": 565.1000183105468, "epoch": 0.9044647143542967, "grad_norm": 0.08053412288427353, "kl": 0.2033343430608511, "learning_rate": 5.494934679991914e-07, "loss": 0.0564, "reward": 1.7614583611488341, "reward_std": 0.16279089897871019, "rewards/accuracy_reward": 0.0520833358168602, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7406250059604644, "step": 2826 }, { "clip_ratio": 0.0, "completion_length": 576.7062622070313, "epoch": 0.90478476556249, "grad_norm": 0.24011105298995972, "kl": 0.26724216900765896, "learning_rate": 5.458456466938233e-07, "loss": 0.0639, "reward": 1.8031250357627868, "reward_std": 0.18479779735207558, "rewards/accuracy_reward": 0.10208333637565374, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7364583432674408, "step": 2827 }, { "clip_ratio": 0.0, "completion_length": 549.0625152587891, "epoch": 0.9051048167706833, "grad_norm": 0.09952478110790253, "kl": 0.2889927580952644, "learning_rate": 5.422096340506089e-07, "loss": 0.0875, "reward": 1.7843750357627868, "reward_std": 0.17545911446213722, "rewards/accuracy_reward": 0.07708333786576986, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7406250178813935, "step": 2828 }, { "clip_ratio": 0.0, "completion_length": 560.0750183105469, "epoch": 0.9054248679788767, "grad_norm": 0.16065450012683868, "kl": 0.31739690750837324, "learning_rate": 5.385854346110853e-07, "loss": 0.1125, "reward": 1.8359375238418578, "reward_std": 0.2566369533538818, "rewards/accuracy_reward": 0.145833339355886, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7338541865348815, "step": 2829 }, { "clip_ratio": 0.0, "completion_length": 592.0604431152344, "epoch": 0.9057449191870699, "grad_norm": 0.19560876488685608, "kl": 0.30017624273896215, "learning_rate": 5.349730529020436e-07, "loss": 0.1009, "reward": 1.7458333730697633, "reward_std": 0.2519936338067055, "rewards/accuracy_reward": 0.06458333637565375, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.7395833492279053, "step": 2830 }, { "clip_ratio": 0.0, "completion_length": 568.1521087646485, "epoch": 0.9060649703952632, "grad_norm": 0.15118496119976044, "kl": 0.21989080756902696, "learning_rate": 5.313724934355102e-07, "loss": 0.0791, "reward": 1.7239583611488343, "reward_std": 0.15902083963155747, "rewards/accuracy_reward": 0.01041666679084301, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7447916984558105, "step": 2831 }, { "clip_ratio": 0.0, "completion_length": 593.327099609375, "epoch": 0.9063850216034566, "grad_norm": 0.262523353099823, "kl": 0.49291563779115677, "learning_rate": 5.277837607087455e-07, "loss": 0.1086, "reward": 1.7229166865348815, "reward_std": 0.24671917259693146, "rewards/accuracy_reward": 0.04166666828095913, "rewards/format_reward": 0.9416666924953461, "rewards/tag_count_reward": 0.7395833611488343, "step": 2832 }, { "clip_ratio": 0.0, "completion_length": 564.0666809082031, "epoch": 0.9067050728116499, "grad_norm": 0.08810193836688995, "kl": 0.2723796620965004, "learning_rate": 5.242068592042349e-07, "loss": 0.0875, "reward": 1.7750000476837158, "reward_std": 0.20654670670628547, "rewards/accuracy_reward": 0.07708333730697632, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7416666865348815, "step": 2833 }, { "clip_ratio": 0.0, "completion_length": 552.1396087646484, "epoch": 0.9070251240198431, "grad_norm": 0.1329844743013382, "kl": 0.37486855015158654, "learning_rate": 5.206417933896901e-07, "loss": 0.0794, "reward": 1.7984375357627869, "reward_std": 0.21113753989338874, "rewards/accuracy_reward": 0.09791666865348816, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7401041746139526, "step": 2834 }, { "clip_ratio": 0.0, "completion_length": 580.2416870117188, "epoch": 0.9073451752280365, "grad_norm": 0.09566251188516617, "kl": 0.24509716033935547, "learning_rate": 5.170885677180382e-07, "loss": 0.0561, "reward": 1.812500035762787, "reward_std": 0.1898981362581253, "rewards/accuracy_reward": 0.11250000428408384, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7437500119209289, "step": 2835 }, { "clip_ratio": 0.0, "completion_length": 568.6187683105469, "epoch": 0.9076652264362298, "grad_norm": 0.16700927913188934, "kl": 0.38997380435466766, "learning_rate": 5.135471866274167e-07, "loss": 0.1089, "reward": 1.8125000476837159, "reward_std": 0.2434275358915329, "rewards/accuracy_reward": 0.1333333384245634, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.735416692495346, "step": 2836 }, { "clip_ratio": 0.0, "completion_length": 553.7104309082031, "epoch": 0.9079852776444232, "grad_norm": 0.16015952825546265, "kl": 0.2919667445123196, "learning_rate": 5.100176545411706e-07, "loss": 0.0978, "reward": 1.7885417103767396, "reward_std": 0.24260507076978682, "rewards/accuracy_reward": 0.09583333786576986, "rewards/format_reward": 0.9604166746139526, "rewards/tag_count_reward": 0.7322916805744171, "step": 2837 }, { "clip_ratio": 0.0, "completion_length": 552.6479339599609, "epoch": 0.9083053288526164, "grad_norm": 0.10905592143535614, "kl": 0.23061162009835243, "learning_rate": 5.064999758678391e-07, "loss": 0.0614, "reward": 1.8208333849906921, "reward_std": 0.15975419506430627, "rewards/accuracy_reward": 0.11041666939854622, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7416666746139526, "step": 2838 }, { "clip_ratio": 0.0, "completion_length": 565.3021026611328, "epoch": 0.9086253800608097, "grad_norm": 0.1088496744632721, "kl": 0.28061444610357283, "learning_rate": 5.029941550011663e-07, "loss": 0.0869, "reward": 1.8427083611488342, "reward_std": 0.20039626583456993, "rewards/accuracy_reward": 0.13750000353902578, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7385416805744172, "step": 2839 }, { "clip_ratio": 0.0, "completion_length": 516.6416839599609, "epoch": 0.9089454312690031, "grad_norm": 0.17556871473789215, "kl": 0.37912697792053224, "learning_rate": 4.995001963200763e-07, "loss": 0.111, "reward": 1.8447916865348817, "reward_std": 0.2618077598512173, "rewards/accuracy_reward": 0.16041667386889458, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7343750119209289, "step": 2840 }, { "clip_ratio": 0.0, "completion_length": 553.7729339599609, "epoch": 0.9092654824771963, "grad_norm": 0.12427106499671936, "kl": 0.2443702958524227, "learning_rate": 4.960181041886802e-07, "loss": 0.0688, "reward": 1.7322916984558105, "reward_std": 0.16462817713618277, "rewards/accuracy_reward": 0.02083333358168602, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7447916865348816, "step": 2841 }, { "clip_ratio": 0.0, "completion_length": 558.7041809082032, "epoch": 0.9095855336853896, "grad_norm": 0.18514235317707062, "kl": 0.22610717713832856, "learning_rate": 4.925478829562668e-07, "loss": 0.0676, "reward": 1.8135417222976684, "reward_std": 0.21168894320726395, "rewards/accuracy_reward": 0.10625000298023224, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7385416805744172, "step": 2842 }, { "clip_ratio": 0.0, "completion_length": 551.4666809082031, "epoch": 0.909905584893583, "grad_norm": 0.13283954560756683, "kl": 0.22682435177266597, "learning_rate": 4.89089536957299e-07, "loss": 0.063, "reward": 1.7192708730697632, "reward_std": 0.13977629393339158, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7421875119209289, "step": 2843 }, { "clip_ratio": 0.0, "completion_length": 568.0479431152344, "epoch": 0.9102256361017763, "grad_norm": 0.1513800472021103, "kl": 0.30399431884288786, "learning_rate": 4.856430705114035e-07, "loss": 0.077, "reward": 1.7833333611488342, "reward_std": 0.17256311923265458, "rewards/accuracy_reward": 0.06875000167638064, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7458333551883698, "step": 2844 }, { "clip_ratio": 0.0, "completion_length": 516.5208557128906, "epoch": 0.9105456873099695, "grad_norm": 0.11325372755527496, "kl": 0.19456406235694884, "learning_rate": 4.822084879233746e-07, "loss": 0.0756, "reward": 1.8213542222976684, "reward_std": 0.18610538244247438, "rewards/accuracy_reward": 0.11041666902601718, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7421875178813935, "step": 2845 }, { "clip_ratio": 0.0, "completion_length": 559.3896087646484, "epoch": 0.9108657385181629, "grad_norm": 0.1760214865207672, "kl": 0.46450803726911544, "learning_rate": 4.787857934831564e-07, "loss": 0.1017, "reward": 1.7854166984558106, "reward_std": 0.21061150655150412, "rewards/accuracy_reward": 0.08541666902601719, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7395833611488343, "step": 2846 }, { "clip_ratio": 0.0, "completion_length": 584.2104309082031, "epoch": 0.9111857897263562, "grad_norm": 0.1330924779176712, "kl": 0.21791302636265755, "learning_rate": 4.7537499146584896e-07, "loss": 0.0718, "reward": 1.8531250357627869, "reward_std": 0.2734475418925285, "rewards/accuracy_reward": 0.14791666977107526, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7385416805744172, "step": 2847 }, { "clip_ratio": 0.0, "completion_length": 559.4687652587891, "epoch": 0.9115058409345496, "grad_norm": 0.19432726502418518, "kl": 0.2989140644669533, "learning_rate": 4.7197608613169685e-07, "loss": 0.0681, "reward": 1.7250000357627868, "reward_std": 0.1789463460445404, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.950000011920929, "rewards/tag_count_reward": 0.7395833492279053, "step": 2848 }, { "clip_ratio": 0.0, "completion_length": 558.6625244140625, "epoch": 0.9118258921427428, "grad_norm": 0.28562673926353455, "kl": 0.31079639568924905, "learning_rate": 4.6858908172608743e-07, "loss": 0.0882, "reward": 1.839583396911621, "reward_std": 0.19922738000750542, "rewards/accuracy_reward": 0.13541667014360428, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7395833492279053, "step": 2849 }, { "clip_ratio": 0.0, "completion_length": 557.2125274658204, "epoch": 0.9121459433509361, "grad_norm": 0.24760982394218445, "kl": 0.21860564053058623, "learning_rate": 4.6521398247953543e-07, "loss": 0.0861, "reward": 1.7479166984558105, "reward_std": 0.1462639383971691, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7437500119209289, "step": 2850 }, { "clip_ratio": 0.0, "completion_length": 580.9354370117187, "epoch": 0.9124659945591295, "grad_norm": 0.1644178032875061, "kl": 0.2740326181054115, "learning_rate": 4.618507926076954e-07, "loss": 0.0977, "reward": 1.7671875357627869, "reward_std": 0.19867352321743964, "rewards/accuracy_reward": 0.06250000223517418, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7421875178813935, "step": 2851 }, { "clip_ratio": 0.0, "completion_length": 552.0396026611328, "epoch": 0.9127860457673228, "grad_norm": 0.22223533689975739, "kl": 0.24603908509016037, "learning_rate": 4.584995163113404e-07, "loss": 0.086, "reward": 1.8843750834465027, "reward_std": 0.18686745911836625, "rewards/accuracy_reward": 0.17500000596046447, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7406250238418579, "step": 2852 }, { "clip_ratio": 0.0, "completion_length": 554.7104431152344, "epoch": 0.913106096975516, "grad_norm": 0.08565722405910492, "kl": 0.2290005251765251, "learning_rate": 4.5516015777636535e-07, "loss": 0.0813, "reward": 1.9026042222976685, "reward_std": 0.23698803335428237, "rewards/accuracy_reward": 0.1916666716337204, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7421875178813935, "step": 2853 }, { "clip_ratio": 0.0, "completion_length": 590.058349609375, "epoch": 0.9134261481837094, "grad_norm": 0.22499686479568481, "kl": 0.3013485103845596, "learning_rate": 4.518327211737761e-07, "loss": 0.0715, "reward": 1.7885417222976685, "reward_std": 0.17250624895095826, "rewards/accuracy_reward": 0.09583333730697632, "rewards/format_reward": 0.9520833432674408, "rewards/tag_count_reward": 0.7406250059604644, "step": 2854 }, { "clip_ratio": 0.0, "completion_length": 579.8104431152344, "epoch": 0.9137461993919027, "grad_norm": 0.1874050498008728, "kl": 0.3433677464723587, "learning_rate": 4.4851721065969243e-07, "loss": 0.1284, "reward": 1.6703125357627868, "reward_std": 0.25159602984786034, "rewards/accuracy_reward": 0.01666666716337204, "rewards/format_reward": 0.9187500238418579, "rewards/tag_count_reward": 0.7348958551883698, "step": 2855 }, { "clip_ratio": 0.0, "completion_length": 541.7979400634765, "epoch": 0.914066250600096, "grad_norm": 0.13606959581375122, "kl": 0.28302004411816595, "learning_rate": 4.4521363037533627e-07, "loss": 0.0707, "reward": 1.7906250476837158, "reward_std": 0.12235096469521523, "rewards/accuracy_reward": 0.07500000316649676, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7489583492279053, "step": 2856 }, { "clip_ratio": 0.0, "completion_length": 547.6062683105469, "epoch": 0.9143863018082893, "grad_norm": 0.07938756793737411, "kl": 0.22487426400184632, "learning_rate": 4.4192198444702685e-07, "loss": 0.0844, "reward": 1.7760416984558105, "reward_std": 0.23371648490428926, "rewards/accuracy_reward": 0.08333333637565374, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7364583551883698, "step": 2857 }, { "clip_ratio": 0.0, "completion_length": 539.8208557128906, "epoch": 0.9147063530164826, "grad_norm": 0.17403903603553772, "kl": 0.39559953659772873, "learning_rate": 4.386422769861742e-07, "loss": 0.0982, "reward": 1.7104166984558105, "reward_std": 0.20936973839998246, "rewards/accuracy_reward": 0.018750000186264514, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7354166865348816, "step": 2858 }, { "clip_ratio": 0.0, "completion_length": 557.0604309082031, "epoch": 0.915026404224676, "grad_norm": 0.13922373950481415, "kl": 0.22795844152569772, "learning_rate": 4.353745120892838e-07, "loss": 0.0631, "reward": 1.7739583492279052, "reward_std": 0.1610909268260002, "rewards/accuracy_reward": 0.05625000018626451, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7427083551883698, "step": 2859 }, { "clip_ratio": 0.0, "completion_length": 561.4104431152343, "epoch": 0.9153464554328693, "grad_norm": 0.11690249294042587, "kl": 0.2956189580261707, "learning_rate": 4.3211869383793735e-07, "loss": 0.1005, "reward": 1.7656250596046448, "reward_std": 0.25438908860087395, "rewards/accuracy_reward": 0.07916666846722364, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7364583492279053, "step": 2860 }, { "clip_ratio": 0.0, "completion_length": 579.4208435058594, "epoch": 0.9156665066410625, "grad_norm": 0.10374096035957336, "kl": 0.22751567736268044, "learning_rate": 4.288748262987996e-07, "loss": 0.1048, "reward": 1.8015625596046447, "reward_std": 0.19006576761603355, "rewards/accuracy_reward": 0.09375000279396772, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7432291805744171, "step": 2861 }, { "clip_ratio": 0.0, "completion_length": 542.8062683105469, "epoch": 0.9159865578492559, "grad_norm": 0.09356549382209778, "kl": 0.20459693036973475, "learning_rate": 4.256429135236062e-07, "loss": 0.0595, "reward": 1.7729166984558105, "reward_std": 0.145799171179533, "rewards/accuracy_reward": 0.052083334513008596, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.7437500178813934, "step": 2862 }, { "clip_ratio": 0.0, "completion_length": 550.9521118164063, "epoch": 0.9163066090574492, "grad_norm": 0.23343823850154877, "kl": 0.3022413983941078, "learning_rate": 4.2242295954915913e-07, "loss": 0.0772, "reward": 1.8635416984558106, "reward_std": 0.15203743427991867, "rewards/accuracy_reward": 0.15000000651925802, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.744791692495346, "step": 2863 }, { "clip_ratio": 0.0, "completion_length": 556.5937652587891, "epoch": 0.9166266602656425, "grad_norm": 0.1239825040102005, "kl": 0.3159925784915686, "learning_rate": 4.1921496839732677e-07, "loss": 0.0643, "reward": 1.7963542342185974, "reward_std": 0.24162321984767915, "rewards/accuracy_reward": 0.09375000167638063, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7421875178813935, "step": 2864 }, { "clip_ratio": 0.0, "completion_length": 579.5729370117188, "epoch": 0.9169467114738358, "grad_norm": 0.19408008456230164, "kl": 0.3092687904834747, "learning_rate": 4.1601894407503507e-07, "loss": 0.0504, "reward": 1.7197916865348817, "reward_std": 0.15204674303531646, "rewards/accuracy_reward": 0.008333333395421505, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7427083611488342, "step": 2865 }, { "clip_ratio": 0.0, "completion_length": 562.6166778564453, "epoch": 0.9172667626820291, "grad_norm": 0.18424807488918304, "kl": 0.47804224863648415, "learning_rate": 4.128348905742585e-07, "loss": 0.0842, "reward": 1.7911458730697631, "reward_std": 0.23824312388896943, "rewards/accuracy_reward": 0.10416667237877845, "rewards/format_reward": 0.950000011920929, "rewards/tag_count_reward": 0.7369791805744171, "step": 2866 }, { "clip_ratio": 0.0, "completion_length": 562.6479370117188, "epoch": 0.9175868138902225, "grad_norm": 0.08997310698032379, "kl": 0.28598271422088145, "learning_rate": 4.096628118720236e-07, "loss": 0.0907, "reward": 1.7640625357627868, "reward_std": 0.1785560056567192, "rewards/accuracy_reward": 0.052083336375653745, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7411458492279053, "step": 2867 }, { "clip_ratio": 0.0, "completion_length": 620.0666809082031, "epoch": 0.9179068650984158, "grad_norm": 0.1769360601902008, "kl": 0.3333309397101402, "learning_rate": 4.065027119303988e-07, "loss": 0.1033, "reward": 1.7739583730697632, "reward_std": 0.26832345873117447, "rewards/accuracy_reward": 0.09166666883975268, "rewards/format_reward": 0.9437500238418579, "rewards/tag_count_reward": 0.7385416746139526, "step": 2868 }, { "clip_ratio": 0.0, "completion_length": 566.7500274658203, "epoch": 0.918226916306609, "grad_norm": 0.11235616356134415, "kl": 0.3151254206895828, "learning_rate": 4.0335459469649117e-07, "loss": 0.0981, "reward": 1.800520896911621, "reward_std": 0.20175526589155196, "rewards/accuracy_reward": 0.09791666977107524, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7401041865348816, "step": 2869 }, { "clip_ratio": 0.0, "completion_length": 593.270849609375, "epoch": 0.9185469675148024, "grad_norm": 0.10434413701295853, "kl": 0.23269173577427865, "learning_rate": 4.002184641024409e-07, "loss": 0.0967, "reward": 1.7770833730697633, "reward_std": 0.25618855506181715, "rewards/accuracy_reward": 0.09166666828095912, "rewards/format_reward": 0.9437500298023224, "rewards/tag_count_reward": 0.7416666805744171, "step": 2870 }, { "clip_ratio": 0.0, "completion_length": 561.6979400634766, "epoch": 0.9188670187229957, "grad_norm": 0.09728308767080307, "kl": 0.21025248169898986, "learning_rate": 3.9709432406541125e-07, "loss": 0.0779, "reward": 1.8218750476837158, "reward_std": 0.15108426734805108, "rewards/accuracy_reward": 0.10833333842456341, "rewards/format_reward": 0.9708333611488342, "rewards/tag_count_reward": 0.7427083492279053, "step": 2871 }, { "clip_ratio": 0.0, "completion_length": 574.3146087646485, "epoch": 0.919187069931189, "grad_norm": 0.16741010546684265, "kl": 0.3041342481970787, "learning_rate": 3.9398217848759637e-07, "loss": 0.0879, "reward": 1.7890625476837159, "reward_std": 0.21354203149676323, "rewards/accuracy_reward": 0.0791666692122817, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7411458551883697, "step": 2872 }, { "clip_ratio": 0.0, "completion_length": 567.2562713623047, "epoch": 0.9195071211393823, "grad_norm": 0.1411246955394745, "kl": 0.23895582556724548, "learning_rate": 3.9088203125620563e-07, "loss": 0.0973, "reward": 1.7385416984558106, "reward_std": 0.2176542192697525, "rewards/accuracy_reward": 0.0541666679084301, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7343750238418579, "step": 2873 }, { "clip_ratio": 0.0, "completion_length": 560.1104339599609, "epoch": 0.9198271723475756, "grad_norm": 0.20984359085559845, "kl": 0.35559300556778906, "learning_rate": 3.877938862434627e-07, "loss": 0.1038, "reward": 1.7473958611488343, "reward_std": 0.19333869963884354, "rewards/accuracy_reward": 0.05000000204890966, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.739062511920929, "step": 2874 }, { "clip_ratio": 0.0, "completion_length": 537.4083465576172, "epoch": 0.920147223555769, "grad_norm": 0.10066534578800201, "kl": 0.2031643271446228, "learning_rate": 3.847177473065955e-07, "loss": 0.0482, "reward": 1.8208333730697632, "reward_std": 0.13629631251096724, "rewards/accuracy_reward": 0.10833333749324084, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7458333432674408, "step": 2875 }, { "clip_ratio": 0.0, "completion_length": 565.5645935058594, "epoch": 0.9204672747639623, "grad_norm": 0.11685199290513992, "kl": 0.2875231482088566, "learning_rate": 3.816536182878416e-07, "loss": 0.0256, "reward": 1.7531250238418579, "reward_std": 0.09930408298969269, "rewards/accuracy_reward": 0.029166667722165585, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7489583492279053, "step": 2876 }, { "clip_ratio": 0.0, "completion_length": 575.2562683105468, "epoch": 0.9207873259721555, "grad_norm": 0.12749531865119934, "kl": 0.27901603281497955, "learning_rate": 3.786015030144352e-07, "loss": 0.0592, "reward": 1.8505208849906922, "reward_std": 0.17607783749699593, "rewards/accuracy_reward": 0.13541667070239782, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7463541746139526, "step": 2877 }, { "clip_ratio": 0.0, "completion_length": 579.6729370117188, "epoch": 0.9211073771803489, "grad_norm": 0.1343710869550705, "kl": 0.22883844375610352, "learning_rate": 3.755614052986056e-07, "loss": 0.0537, "reward": 1.8072917222976685, "reward_std": 0.1575484722852707, "rewards/accuracy_reward": 0.08958333693444728, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7468750178813934, "step": 2878 }, { "clip_ratio": 0.0, "completion_length": 583.3687744140625, "epoch": 0.9214274283885422, "grad_norm": 0.14246773719787598, "kl": 0.3555714774876833, "learning_rate": 3.7253332893756877e-07, "loss": 0.1228, "reward": 1.7880208849906922, "reward_std": 0.2130596399307251, "rewards/accuracy_reward": 0.10625000316649676, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7338541805744171, "step": 2879 }, { "clip_ratio": 0.0, "completion_length": 561.958349609375, "epoch": 0.9217474795967355, "grad_norm": 0.1846427470445633, "kl": 0.2668366312980652, "learning_rate": 3.695172777135292e-07, "loss": 0.0932, "reward": 1.7718750476837157, "reward_std": 0.15560345873236656, "rewards/accuracy_reward": 0.06666666865348816, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7427083551883698, "step": 2880 }, { "clip_ratio": 0.0, "completion_length": 570.7458587646485, "epoch": 0.9220675308049288, "grad_norm": 0.13593432307243347, "kl": 0.46407483220100404, "learning_rate": 3.66513255393669e-07, "loss": 0.0988, "reward": 1.7625000596046447, "reward_std": 0.22477970719337464, "rewards/accuracy_reward": 0.07500000223517418, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7333333432674408, "step": 2881 }, { "clip_ratio": 0.0, "completion_length": 571.7854370117187, "epoch": 0.9223875820131221, "grad_norm": 0.08887147158384323, "kl": 0.20311668664216995, "learning_rate": 3.6352126573015013e-07, "loss": 0.0405, "reward": 1.7395833492279054, "reward_std": 0.13752839267253875, "rewards/accuracy_reward": 0.01458333358168602, "rewards/format_reward": 0.9770833551883698, "rewards/tag_count_reward": 0.7479166865348816, "step": 2882 }, { "clip_ratio": 0.0, "completion_length": 564.2562713623047, "epoch": 0.9227076332213154, "grad_norm": 0.17821405827999115, "kl": 0.2897450774908066, "learning_rate": 3.605413124600965e-07, "loss": 0.1086, "reward": 1.7807292103767396, "reward_std": 0.22551444172859192, "rewards/accuracy_reward": 0.07291666958481073, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7432291746139527, "step": 2883 }, { "clip_ratio": 0.0, "completion_length": 556.2875152587891, "epoch": 0.9230276844295088, "grad_norm": 0.17026633024215698, "kl": 0.2222075067460537, "learning_rate": 3.575733993056063e-07, "loss": 0.0778, "reward": 1.8151042103767394, "reward_std": 0.1940957099199295, "rewards/accuracy_reward": 0.10416666939854621, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7442708551883698, "step": 2884 }, { "clip_ratio": 0.0, "completion_length": 592.1583618164062, "epoch": 0.923347735637702, "grad_norm": 0.2168344110250473, "kl": 0.3349597044289112, "learning_rate": 3.546175299737342e-07, "loss": 0.0803, "reward": 1.7307291984558106, "reward_std": 0.17358548641204835, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7390625298023223, "step": 2885 }, { "clip_ratio": 0.0, "completion_length": 560.9458557128906, "epoch": 0.9236677868458953, "grad_norm": 0.2521231472492218, "kl": 0.24037815183401107, "learning_rate": 3.5167370815649694e-07, "loss": 0.0825, "reward": 1.7151041984558106, "reward_std": 0.19021971225738527, "rewards/accuracy_reward": 0.010416666977107525, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7442708551883698, "step": 2886 }, { "clip_ratio": 0.0, "completion_length": 541.1583526611328, "epoch": 0.9239878380540887, "grad_norm": 0.17703336477279663, "kl": 0.21621669009327887, "learning_rate": 3.4874193753085426e-07, "loss": 0.0605, "reward": 1.7875000715255738, "reward_std": 0.11906407549977302, "rewards/accuracy_reward": 0.07083333544433117, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7479166805744171, "step": 2887 }, { "clip_ratio": 0.0, "completion_length": 547.645849609375, "epoch": 0.9243078892622819, "grad_norm": 0.18615660071372986, "kl": 0.344948410987854, "learning_rate": 3.458222217587226e-07, "loss": 0.1063, "reward": 1.789062535762787, "reward_std": 0.25289190337061884, "rewards/accuracy_reward": 0.09375000093132257, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7390625178813934, "step": 2888 }, { "clip_ratio": 0.0, "completion_length": 551.3146057128906, "epoch": 0.9246279404704753, "grad_norm": 0.07631520926952362, "kl": 0.15430775843560696, "learning_rate": 3.4291456448695805e-07, "loss": 0.0496, "reward": 1.7645833611488342, "reward_std": 0.12003937363624573, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.9770833432674408, "rewards/tag_count_reward": 0.7416666805744171, "step": 2889 }, { "clip_ratio": 0.0, "completion_length": 591.5291870117187, "epoch": 0.9249479916786686, "grad_norm": 0.24916376173496246, "kl": 0.35432265847921374, "learning_rate": 3.4001896934735436e-07, "loss": 0.0911, "reward": 1.770312535762787, "reward_std": 0.21689439043402672, "rewards/accuracy_reward": 0.07708333395421504, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.736979192495346, "step": 2890 }, { "clip_ratio": 0.0, "completion_length": 554.0041870117187, "epoch": 0.9252680428868619, "grad_norm": 0.18258577585220337, "kl": 0.3797022372484207, "learning_rate": 3.3713543995663735e-07, "loss": 0.1418, "reward": 1.6942708611488342, "reward_std": 0.21223524063825608, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7401041924953461, "step": 2891 }, { "clip_ratio": 0.0, "completion_length": 558.5771026611328, "epoch": 0.9255880940950552, "grad_norm": 0.1253851056098938, "kl": 0.2872502990067005, "learning_rate": 3.34263979916466e-07, "loss": 0.0809, "reward": 1.8552083849906922, "reward_std": 0.2737518347799778, "rewards/accuracy_reward": 0.16458333786576987, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7364583492279053, "step": 2892 }, { "clip_ratio": 0.0, "completion_length": 541.6458587646484, "epoch": 0.9259081453032485, "grad_norm": 0.12900535762310028, "kl": 0.27396869882941244, "learning_rate": 3.314045928134224e-07, "loss": 0.1208, "reward": 1.9781250715255738, "reward_std": 0.2516678273677826, "rewards/accuracy_reward": 0.27708334028720855, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7406250238418579, "step": 2893 }, { "clip_ratio": 0.0, "completion_length": 559.3500122070312, "epoch": 0.9262281965114418, "grad_norm": 0.16405299305915833, "kl": 0.23438700921833516, "learning_rate": 3.2855728221900975e-07, "loss": 0.0605, "reward": 1.817708396911621, "reward_std": 0.11167410463094711, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.975000011920929, "rewards/tag_count_reward": 0.7427083492279053, "step": 2894 }, { "clip_ratio": 0.0, "completion_length": 569.8250091552734, "epoch": 0.9265482477196352, "grad_norm": 0.3562224507331848, "kl": 0.359239012748003, "learning_rate": 3.2572205168964645e-07, "loss": 0.126, "reward": 1.709375023841858, "reward_std": 0.20814426690340043, "rewards/accuracy_reward": 0.01458333358168602, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7364583611488342, "step": 2895 }, { "clip_ratio": 0.0, "completion_length": 576.6000183105468, "epoch": 0.9268682989278284, "grad_norm": 0.21366332471370697, "kl": 0.20416639670729636, "learning_rate": 3.2289890476665975e-07, "loss": 0.07, "reward": 1.7697917103767395, "reward_std": 0.19547061547636985, "rewards/accuracy_reward": 0.06875000186264515, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7385416865348816, "step": 2896 }, { "clip_ratio": 0.0, "completion_length": 582.0104309082031, "epoch": 0.9271883501360217, "grad_norm": 0.10463610291481018, "kl": 0.25862638279795647, "learning_rate": 3.200878449762901e-07, "loss": 0.0649, "reward": 1.8213542342185973, "reward_std": 0.20296233296394348, "rewards/accuracy_reward": 0.11666666995733976, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7442708432674408, "step": 2897 }, { "clip_ratio": 0.0, "completion_length": 554.3625244140625, "epoch": 0.9275084013442151, "grad_norm": 0.2624492645263672, "kl": 0.31345591209828855, "learning_rate": 3.172888758296755e-07, "loss": 0.1211, "reward": 1.7791666984558105, "reward_std": 0.2839872606098652, "rewards/accuracy_reward": 0.1000000024214387, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7333333551883697, "step": 2898 }, { "clip_ratio": 0.0, "completion_length": 554.6312652587891, "epoch": 0.9278284525524084, "grad_norm": 0.18744568526744843, "kl": 0.30883694961667063, "learning_rate": 3.145020008228539e-07, "loss": 0.1168, "reward": 1.8046875715255737, "reward_std": 0.16510328650474548, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7421875238418579, "step": 2899 }, { "clip_ratio": 0.0, "completion_length": 561.0708465576172, "epoch": 0.9281485037606017, "grad_norm": 0.10015768557786942, "kl": 0.2992133036255836, "learning_rate": 3.117272234367563e-07, "loss": 0.0921, "reward": 1.744270884990692, "reward_std": 0.18532090559601783, "rewards/accuracy_reward": 0.0458333345130086, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7442708492279053, "step": 2900 }, { "clip_ratio": 0.0, "completion_length": 574.2625122070312, "epoch": 0.928468554968795, "grad_norm": 0.14354941248893738, "kl": 0.27462932020425795, "learning_rate": 3.089645471372038e-07, "loss": 0.0729, "reward": 1.7692708730697633, "reward_std": 0.1999375715851784, "rewards/accuracy_reward": 0.08541666977107525, "rewards/format_reward": 0.9500000298023223, "rewards/tag_count_reward": 0.7338541984558106, "step": 2901 }, { "clip_ratio": 0.0, "completion_length": 565.508349609375, "epoch": 0.9287886061769883, "grad_norm": 0.17410211265087128, "kl": 0.24624013304710388, "learning_rate": 3.0621397537490494e-07, "loss": 0.1219, "reward": 1.7942708611488343, "reward_std": 0.3087786689400673, "rewards/accuracy_reward": 0.10833333730697632, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.735937523841858, "step": 2902 }, { "clip_ratio": 0.0, "completion_length": 560.737515258789, "epoch": 0.9291086573851817, "grad_norm": 0.14404526352882385, "kl": 0.18631610609591007, "learning_rate": 3.0347551158544597e-07, "loss": 0.0563, "reward": 1.7703125119209289, "reward_std": 0.1441615879535675, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7453125238418579, "step": 2903 }, { "clip_ratio": 0.0, "completion_length": 532.439599609375, "epoch": 0.9294287085933749, "grad_norm": 0.20506027340888977, "kl": 0.37727788612246516, "learning_rate": 3.007491591892886e-07, "loss": 0.0967, "reward": 1.8562500596046447, "reward_std": 0.2165341705083847, "rewards/accuracy_reward": 0.1541666707023978, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7375000238418579, "step": 2904 }, { "clip_ratio": 0.0, "completion_length": 559.2062713623047, "epoch": 0.9297487598015682, "grad_norm": 0.18293797969818115, "kl": 0.20713808685541152, "learning_rate": 2.9803492159177103e-07, "loss": 0.071, "reward": 1.807812547683716, "reward_std": 0.18278513848781586, "rewards/accuracy_reward": 0.0875000013038516, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.7432291805744171, "step": 2905 }, { "clip_ratio": 0.0, "completion_length": 555.0562713623046, "epoch": 0.9300688110097616, "grad_norm": 0.22001373767852783, "kl": 0.17172672897577285, "learning_rate": 2.953328021830981e-07, "loss": 0.0572, "reward": 1.825520896911621, "reward_std": 0.15396547242999076, "rewards/accuracy_reward": 0.11041667070239783, "rewards/format_reward": 0.9729166924953461, "rewards/tag_count_reward": 0.7421875238418579, "step": 2906 }, { "clip_ratio": 0.0, "completion_length": 557.6729431152344, "epoch": 0.9303888622179549, "grad_norm": 0.09129034727811813, "kl": 0.2686162628233433, "learning_rate": 2.926428043383378e-07, "loss": 0.0988, "reward": 1.814583384990692, "reward_std": 0.2419304519891739, "rewards/accuracy_reward": 0.11875000353902579, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7395833551883697, "step": 2907 }, { "clip_ratio": 0.0, "completion_length": 575.1291931152343, "epoch": 0.9307089134261481, "grad_norm": 0.340289443731308, "kl": 0.6954551450908184, "learning_rate": 2.8996493141741686e-07, "loss": 0.1143, "reward": 1.7781250476837158, "reward_std": 0.28263785168528555, "rewards/accuracy_reward": 0.1062500037252903, "rewards/format_reward": 0.9375000178813935, "rewards/tag_count_reward": 0.7343750238418579, "step": 2908 }, { "clip_ratio": 0.0, "completion_length": 571.7979339599609, "epoch": 0.9310289646343415, "grad_norm": 0.2295711785554886, "kl": 0.43438730686903, "learning_rate": 2.8729918676511983e-07, "loss": 0.0989, "reward": 1.8583333849906922, "reward_std": 0.2656490132212639, "rewards/accuracy_reward": 0.18541667014360427, "rewards/format_reward": 0.9354166924953461, "rewards/tag_count_reward": 0.7375000238418579, "step": 2909 }, { "clip_ratio": 0.0, "completion_length": 577.5687652587891, "epoch": 0.9313490158425348, "grad_norm": 0.1720622330904007, "kl": 0.35845495462417604, "learning_rate": 2.846455737110787e-07, "loss": 0.0689, "reward": 1.7369791865348816, "reward_std": 0.21831071525812148, "rewards/accuracy_reward": 0.05000000149011612, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7369791984558105, "step": 2910 }, { "clip_ratio": 0.0, "completion_length": 582.3021057128906, "epoch": 0.9316690670507282, "grad_norm": 0.16650277376174927, "kl": 0.35746832117438315, "learning_rate": 2.8200409556977894e-07, "loss": 0.0936, "reward": 1.8375000357627869, "reward_std": 0.25913550406694413, "rewards/accuracy_reward": 0.13958333656191826, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7395833492279053, "step": 2911 }, { "clip_ratio": 0.0, "completion_length": 563.5666870117187, "epoch": 0.9319891182589214, "grad_norm": 0.12058953940868378, "kl": 0.24439542815089227, "learning_rate": 2.7937475564054017e-07, "loss": 0.0821, "reward": 1.7796875357627868, "reward_std": 0.21224845796823502, "rewards/accuracy_reward": 0.0812500013038516, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7380208551883698, "step": 2912 }, { "clip_ratio": 0.0, "completion_length": 589.9521026611328, "epoch": 0.9323091694671147, "grad_norm": 0.21773386001586914, "kl": 0.35191044956445694, "learning_rate": 2.767575572075287e-07, "loss": 0.1179, "reward": 1.7239583730697632, "reward_std": 0.23633338287472724, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7406250178813935, "step": 2913 }, { "clip_ratio": 0.0, "completion_length": 549.6396057128907, "epoch": 0.9326292206753081, "grad_norm": 0.14750872552394867, "kl": 0.5395490519702435, "learning_rate": 2.74152503539743e-07, "loss": 0.0997, "reward": 1.7177083969116211, "reward_std": 0.19782040342688562, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7343750178813935, "step": 2914 }, { "clip_ratio": 0.0, "completion_length": 587.5583557128906, "epoch": 0.9329492718835014, "grad_norm": 0.10624227672815323, "kl": 0.4461661420762539, "learning_rate": 2.7155959789101127e-07, "loss": 0.1291, "reward": 1.7812500476837159, "reward_std": 0.28088073134422303, "rewards/accuracy_reward": 0.10416667088866234, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.7333333611488342, "step": 2915 }, { "clip_ratio": 0.0, "completion_length": 553.4812683105469, "epoch": 0.9332693230916946, "grad_norm": 0.196004256606102, "kl": 0.20092077553272247, "learning_rate": 2.6897884349998735e-07, "loss": 0.0628, "reward": 1.8072917103767394, "reward_std": 0.1401705838739872, "rewards/accuracy_reward": 0.08958333563059569, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7427083492279053, "step": 2916 }, { "clip_ratio": 0.0, "completion_length": 573.5687744140625, "epoch": 0.933589374299888, "grad_norm": 0.14856016635894775, "kl": 0.3332873769104481, "learning_rate": 2.6641024359015056e-07, "loss": 0.109, "reward": 1.8395833849906922, "reward_std": 0.2291702926158905, "rewards/accuracy_reward": 0.1395833358168602, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7416666865348815, "step": 2917 }, { "clip_ratio": 0.0, "completion_length": 587.2500183105469, "epoch": 0.9339094255080813, "grad_norm": 0.3066563010215759, "kl": 0.5219496801495552, "learning_rate": 2.638538013697956e-07, "loss": 0.1372, "reward": 1.7557292103767395, "reward_std": 0.24314365535974503, "rewards/accuracy_reward": 0.07916666977107525, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.7348958551883698, "step": 2918 }, { "clip_ratio": 0.0, "completion_length": 572.4666809082031, "epoch": 0.9342294767162747, "grad_norm": 0.23431648313999176, "kl": 0.21682993099093437, "learning_rate": 2.613095200320359e-07, "loss": 0.0652, "reward": 1.8515625715255737, "reward_std": 0.1445869944989681, "rewards/accuracy_reward": 0.13958333637565373, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7411458492279053, "step": 2919 }, { "clip_ratio": 0.0, "completion_length": 589.3687683105469, "epoch": 0.9345495279244679, "grad_norm": 0.20319998264312744, "kl": 0.2534866757690907, "learning_rate": 2.587774027547918e-07, "loss": 0.0787, "reward": 1.7703125476837158, "reward_std": 0.2077922374010086, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7390625178813934, "step": 2920 }, { "clip_ratio": 0.0, "completion_length": 569.2833435058594, "epoch": 0.9348695791326612, "grad_norm": 0.18832314014434814, "kl": 0.4107868306338787, "learning_rate": 2.5625745270078775e-07, "loss": 0.0822, "reward": 1.7130208730697631, "reward_std": 0.20654768422245978, "rewards/accuracy_reward": 0.01666666679084301, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7380208551883698, "step": 2921 }, { "clip_ratio": 0.0, "completion_length": 563.7833557128906, "epoch": 0.9351896303408546, "grad_norm": 0.24442380666732788, "kl": 0.2872084707021713, "learning_rate": 2.5374967301755924e-07, "loss": 0.0997, "reward": 1.758333396911621, "reward_std": 0.2554009936749935, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7333333492279053, "step": 2922 }, { "clip_ratio": 0.0, "completion_length": 587.0437622070312, "epoch": 0.9355096815490479, "grad_norm": 0.1751808375120163, "kl": 0.44566518142819406, "learning_rate": 2.5125406683743417e-07, "loss": 0.096, "reward": 1.7348958849906921, "reward_std": 0.21916937381029128, "rewards/accuracy_reward": 0.03958333395421505, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7369791865348816, "step": 2923 }, { "clip_ratio": 0.0, "completion_length": 555.02294921875, "epoch": 0.9358297327572411, "grad_norm": 0.12745539844036102, "kl": 0.15233333893120288, "learning_rate": 2.487706372775345e-07, "loss": 0.0583, "reward": 1.7635417103767395, "reward_std": 0.11595178246498108, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.9770833492279053, "rewards/tag_count_reward": 0.7447916865348816, "step": 2924 }, { "clip_ratio": 0.0, "completion_length": 570.4375244140625, "epoch": 0.9361497839654345, "grad_norm": 0.17469380795955658, "kl": 0.3449023649096489, "learning_rate": 2.4629938743977567e-07, "loss": 0.1054, "reward": 1.8369791984558106, "reward_std": 0.25577750355005263, "rewards/accuracy_reward": 0.15000000409781933, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.732812511920929, "step": 2925 }, { "clip_ratio": 0.0, "completion_length": 575.6416870117188, "epoch": 0.9364698351736278, "grad_norm": 0.29228219389915466, "kl": 0.5941868476569653, "learning_rate": 2.438403204108597e-07, "loss": 0.0752, "reward": 1.8057292222976684, "reward_std": 0.20368908420205117, "rewards/accuracy_reward": 0.10625000316649676, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7390625178813934, "step": 2926 }, { "clip_ratio": 0.0, "completion_length": 550.3604339599609, "epoch": 0.9367898863818211, "grad_norm": 0.30949291586875916, "kl": 0.20246201269328595, "learning_rate": 2.413934392622719e-07, "loss": 0.0582, "reward": 1.811458373069763, "reward_std": 0.15752314329147338, "rewards/accuracy_reward": 0.08750000260770321, "rewards/format_reward": 0.9750000178813935, "rewards/tag_count_reward": 0.7489583492279053, "step": 2927 }, { "clip_ratio": 0.0, "completion_length": 570.1041839599609, "epoch": 0.9371099375900144, "grad_norm": 0.2570835053920746, "kl": 0.24000799655914307, "learning_rate": 2.3895874705027635e-07, "loss": 0.0772, "reward": 1.8093750476837158, "reward_std": 0.16699628233909608, "rewards/accuracy_reward": 0.10833333637565375, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7385416865348816, "step": 2928 }, { "clip_ratio": 0.0, "completion_length": 553.5166870117188, "epoch": 0.9374299887982077, "grad_norm": 0.28607016801834106, "kl": 0.3831566788256168, "learning_rate": 2.3653624681591048e-07, "loss": 0.0714, "reward": 1.7671875476837158, "reward_std": 0.2298620417714119, "rewards/accuracy_reward": 0.06666666902601719, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7401041805744171, "step": 2929 }, { "clip_ratio": 0.0, "completion_length": 583.8125122070312, "epoch": 0.937750040006401, "grad_norm": 0.09761839359998703, "kl": 0.18521942049264908, "learning_rate": 2.3412594158498836e-07, "loss": 0.0764, "reward": 1.7520833492279053, "reward_std": 0.16688498705625535, "rewards/accuracy_reward": 0.05208333432674408, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7375000238418579, "step": 2930 }, { "clip_ratio": 0.0, "completion_length": 584.4187622070312, "epoch": 0.9380700912145943, "grad_norm": 0.43173423409461975, "kl": 0.29425476044416427, "learning_rate": 2.3172783436808844e-07, "loss": 0.1062, "reward": 1.7093750357627868, "reward_std": 0.22269544750452042, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.9416666805744172, "rewards/tag_count_reward": 0.7343750238418579, "step": 2931 }, { "clip_ratio": 0.0, "completion_length": 570.581265258789, "epoch": 0.9383901424227876, "grad_norm": 0.16565969586372375, "kl": 0.277605714648962, "learning_rate": 2.2934192816055355e-07, "loss": 0.063, "reward": 1.8677083730697632, "reward_std": 0.20725997984409333, "rewards/accuracy_reward": 0.15833333786576986, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.7364583551883698, "step": 2932 }, { "clip_ratio": 0.0, "completion_length": 573.6312622070312, "epoch": 0.938710193630981, "grad_norm": 0.07633515447378159, "kl": 0.28674716129899025, "learning_rate": 2.2696822594248768e-07, "loss": 0.1155, "reward": 1.7755208730697631, "reward_std": 0.23790881559252738, "rewards/accuracy_reward": 0.0916666692122817, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7359375119209289, "step": 2933 }, { "clip_ratio": 0.0, "completion_length": 590.2958435058594, "epoch": 0.9390302448391743, "grad_norm": 0.1412808895111084, "kl": 0.2938103273510933, "learning_rate": 2.2460673067875029e-07, "loss": 0.0979, "reward": 1.723437523841858, "reward_std": 0.18467547446489335, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7421875178813935, "step": 2934 }, { "clip_ratio": 0.0, "completion_length": 599.4854339599609, "epoch": 0.9393502960473675, "grad_norm": 0.17745190858840942, "kl": 0.3535077393054962, "learning_rate": 2.2225744531895632e-07, "loss": 0.094, "reward": 1.7895833730697632, "reward_std": 0.2649504989385605, "rewards/accuracy_reward": 0.09375000353902578, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7395833611488343, "step": 2935 }, { "clip_ratio": 0.0, "completion_length": 563.5416931152344, "epoch": 0.9396703472555609, "grad_norm": 0.23511171340942383, "kl": 0.43363613411784174, "learning_rate": 2.1992037279746746e-07, "loss": 0.0866, "reward": 1.7776042103767395, "reward_std": 0.2053623117506504, "rewards/accuracy_reward": 0.07500000279396772, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7380208432674408, "step": 2936 }, { "clip_ratio": 0.0, "completion_length": 553.0541870117188, "epoch": 0.9399903984637542, "grad_norm": 0.21815741062164307, "kl": 0.2682798236608505, "learning_rate": 2.1759551603339092e-07, "loss": 0.0642, "reward": 1.832812535762787, "reward_std": 0.2220204994082451, "rewards/accuracy_reward": 0.14375000409781932, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7369791805744171, "step": 2937 }, { "clip_ratio": 0.0, "completion_length": 536.4479278564453, "epoch": 0.9403104496719475, "grad_norm": 0.18351462483406067, "kl": 0.21441592164337636, "learning_rate": 2.1528287793057934e-07, "loss": 0.0714, "reward": 1.7739583730697632, "reward_std": 0.1786945417523384, "rewards/accuracy_reward": 0.07083333469927311, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.740625011920929, "step": 2938 }, { "clip_ratio": 0.0, "completion_length": 585.1896026611328, "epoch": 0.9406305008801408, "grad_norm": 0.20814381539821625, "kl": 0.31521559022367, "learning_rate": 2.129824613776188e-07, "loss": 0.0862, "reward": 1.707812523841858, "reward_std": 0.16487954929471016, "rewards/accuracy_reward": 0.00625, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7411458492279053, "step": 2939 }, { "clip_ratio": 0.0, "completion_length": 568.5250244140625, "epoch": 0.9409505520883341, "grad_norm": 0.14655859768390656, "kl": 0.24050994589924812, "learning_rate": 2.1069426924783532e-07, "loss": 0.0874, "reward": 1.8135416984558106, "reward_std": 0.22481777146458626, "rewards/accuracy_reward": 0.1041666705161333, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7447916865348816, "step": 2940 }, { "clip_ratio": 0.0, "completion_length": 595.1125091552734, "epoch": 0.9412706032965275, "grad_norm": 0.35549604892730713, "kl": 0.33449497222900393, "learning_rate": 2.0841830439928045e-07, "loss": 0.0939, "reward": 1.752083384990692, "reward_std": 0.2661015272140503, "rewards/accuracy_reward": 0.060416669212281705, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7416666924953461, "step": 2941 }, { "clip_ratio": 0.0, "completion_length": 565.5687774658203, "epoch": 0.9415906545047208, "grad_norm": 0.18642841279506683, "kl": 0.4748840194195509, "learning_rate": 2.06154569674738e-07, "loss": 0.0888, "reward": 1.8156250357627868, "reward_std": 0.19740560948848723, "rewards/accuracy_reward": 0.12708333730697632, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7385416865348816, "step": 2942 }, { "clip_ratio": 0.0, "completion_length": 595.5187561035157, "epoch": 0.941910705712914, "grad_norm": 0.1558806449174881, "kl": 0.5146293081343174, "learning_rate": 2.0390306790171398e-07, "loss": 0.0856, "reward": 1.7010416865348816, "reward_std": 0.23505208715796472, "rewards/accuracy_reward": 0.02708333432674408, "rewards/format_reward": 0.9375000178813935, "rewards/tag_count_reward": 0.7364583551883698, "step": 2943 }, { "clip_ratio": 0.0, "completion_length": 575.527099609375, "epoch": 0.9422307569211074, "grad_norm": 0.07957801222801208, "kl": 0.28175764046609403, "learning_rate": 2.016638018924344e-07, "loss": 0.096, "reward": 1.7854167103767395, "reward_std": 0.2046665370464325, "rewards/accuracy_reward": 0.09583333637565375, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.737500011920929, "step": 2944 }, { "clip_ratio": 0.0, "completion_length": 572.7521057128906, "epoch": 0.9425508081293007, "grad_norm": 0.09544239938259125, "kl": 0.2202799826860428, "learning_rate": 1.9943677444384192e-07, "loss": 0.0668, "reward": 1.743750011920929, "reward_std": 0.17324633449316024, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7375000178813934, "step": 2945 }, { "clip_ratio": 0.0, "completion_length": 575.3604431152344, "epoch": 0.942870859337494, "grad_norm": 0.13867995142936707, "kl": 0.28282299637794495, "learning_rate": 1.9722198833759366e-07, "loss": 0.0822, "reward": 1.7552083611488343, "reward_std": 0.15816964358091354, "rewards/accuracy_reward": 0.04375000130385161, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7447916865348816, "step": 2946 }, { "clip_ratio": 0.0, "completion_length": 563.339599609375, "epoch": 0.9431909105456873, "grad_norm": 0.12346024811267853, "kl": 0.21222805231809616, "learning_rate": 1.95019446340059e-07, "loss": 0.0875, "reward": 1.7489583611488342, "reward_std": 0.18876290768384935, "rewards/accuracy_reward": 0.04791666828095913, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7406250178813935, "step": 2947 }, { "clip_ratio": 0.0, "completion_length": 548.9896026611328, "epoch": 0.9435109617538806, "grad_norm": 0.22070105373859406, "kl": 0.24906435757875442, "learning_rate": 1.928291512023106e-07, "loss": 0.0606, "reward": 1.7786458730697632, "reward_std": 0.16378662288188933, "rewards/accuracy_reward": 0.07291666995733977, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7411458492279053, "step": 2948 }, { "clip_ratio": 0.0, "completion_length": 545.9604278564453, "epoch": 0.943831012962074, "grad_norm": 0.25414496660232544, "kl": 0.45791466608643533, "learning_rate": 1.9065110566012347e-07, "loss": 0.0893, "reward": 1.806770884990692, "reward_std": 0.18329955637454987, "rewards/accuracy_reward": 0.11250000204890967, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7380208551883698, "step": 2949 }, { "clip_ratio": 0.0, "completion_length": 557.8354400634765, "epoch": 0.9441510641702673, "grad_norm": 0.10287056118249893, "kl": 0.19068565890192984, "learning_rate": 1.8848531243397471e-07, "loss": 0.0742, "reward": 1.8005208611488341, "reward_std": 0.2313265398144722, "rewards/accuracy_reward": 0.09791666902601719, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7380208492279052, "step": 2950 }, { "clip_ratio": 0.0, "completion_length": 614.2437683105469, "epoch": 0.9444711153784605, "grad_norm": 0.31609681248664856, "kl": 0.4250298887491226, "learning_rate": 1.8633177422903824e-07, "loss": 0.1, "reward": 1.7354166984558106, "reward_std": 0.23066288232803345, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7395833611488343, "step": 2951 }, { "clip_ratio": 0.0, "completion_length": 578.0771118164063, "epoch": 0.9447911665866539, "grad_norm": 0.1550053358078003, "kl": 0.4375451445579529, "learning_rate": 1.8419049373517904e-07, "loss": 0.0789, "reward": 1.7687500357627868, "reward_std": 0.1940486691892147, "rewards/accuracy_reward": 0.06458333376795053, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7416666924953461, "step": 2952 }, { "clip_ratio": 0.0, "completion_length": 575.4896118164063, "epoch": 0.9451112177948472, "grad_norm": 0.12883856892585754, "kl": 0.2577236250042915, "learning_rate": 1.8206147362695214e-07, "loss": 0.0695, "reward": 1.7578125238418578, "reward_std": 0.2019563138484955, "rewards/accuracy_reward": 0.04583333395421505, "rewards/format_reward": 0.9687500298023224, "rewards/tag_count_reward": 0.7432291865348816, "step": 2953 }, { "clip_ratio": 0.0, "completion_length": 569.5916870117187, "epoch": 0.9454312690030405, "grad_norm": 0.2383638322353363, "kl": 0.3122972398996353, "learning_rate": 1.7994471656359814e-07, "loss": 0.1238, "reward": 1.7244791865348816, "reward_std": 0.24809951484203338, "rewards/accuracy_reward": 0.043750000186264515, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7328125357627868, "step": 2954 }, { "clip_ratio": 0.0, "completion_length": 582.689599609375, "epoch": 0.9457513202112338, "grad_norm": 0.16189329326152802, "kl": 0.22522822245955468, "learning_rate": 1.778402251890432e-07, "loss": 0.0805, "reward": 1.7411458611488342, "reward_std": 0.18320491760969163, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.743229192495346, "step": 2955 }, { "clip_ratio": 0.0, "completion_length": 602.5250122070313, "epoch": 0.9460713714194271, "grad_norm": 0.13715258240699768, "kl": 0.3096561312675476, "learning_rate": 1.7574800213189137e-07, "loss": 0.1016, "reward": 1.8203125476837159, "reward_std": 0.26009301394224166, "rewards/accuracy_reward": 0.1270833369344473, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7432291805744171, "step": 2956 }, { "clip_ratio": 0.0, "completion_length": 552.7750122070313, "epoch": 0.9463914226276204, "grad_norm": 0.08837933093309402, "kl": 0.2637955330312252, "learning_rate": 1.7366805000542108e-07, "loss": 0.0762, "reward": 1.7151042103767395, "reward_std": 0.19586375132203102, "rewards/accuracy_reward": 0.016666667349636555, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7401041865348816, "step": 2957 }, { "clip_ratio": 0.0, "completion_length": 565.7979339599609, "epoch": 0.9467114738358138, "grad_norm": 0.1346248984336853, "kl": 0.20087103992700578, "learning_rate": 1.7160037140758645e-07, "loss": 0.0779, "reward": 1.862500047683716, "reward_std": 0.1999605506658554, "rewards/accuracy_reward": 0.15625000409781933, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7395833611488343, "step": 2958 }, { "clip_ratio": 0.0, "completion_length": 582.5771118164063, "epoch": 0.947031525044007, "grad_norm": 0.17053905129432678, "kl": 0.2561337880790234, "learning_rate": 1.6954496892101047e-07, "loss": 0.0945, "reward": 1.7369791984558105, "reward_std": 0.17246170416474343, "rewards/accuracy_reward": 0.03958333451300859, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7411458432674408, "step": 2959 }, { "clip_ratio": 0.0, "completion_length": 573.6000305175781, "epoch": 0.9473515762522003, "grad_norm": 0.19663800299167633, "kl": 0.35930315852165223, "learning_rate": 1.6750184511298285e-07, "loss": 0.1183, "reward": 1.7598958611488342, "reward_std": 0.18631593957543374, "rewards/accuracy_reward": 0.06041666883975268, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7369791865348816, "step": 2960 }, { "clip_ratio": 0.0, "completion_length": 553.0625183105469, "epoch": 0.9476716274603937, "grad_norm": 0.1454787403345108, "kl": 0.19797628447413446, "learning_rate": 1.6547100253545889e-07, "loss": 0.0749, "reward": 1.8822917342185974, "reward_std": 0.1931290477514267, "rewards/accuracy_reward": 0.16041667126119136, "rewards/format_reward": 0.9791666805744171, "rewards/tag_count_reward": 0.7427083432674408, "step": 2961 }, { "clip_ratio": 0.0, "completion_length": 546.8021026611328, "epoch": 0.947991678668587, "grad_norm": 0.18358418345451355, "kl": 0.22578486204147338, "learning_rate": 1.6345244372504842e-07, "loss": 0.0949, "reward": 1.7697916984558106, "reward_std": 0.18777953833341599, "rewards/accuracy_reward": 0.06250000130385161, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7406250238418579, "step": 2962 }, { "clip_ratio": 0.0, "completion_length": 559.2937683105469, "epoch": 0.9483117298767803, "grad_norm": 0.1297026425600052, "kl": 0.3527979046106339, "learning_rate": 1.6144617120302351e-07, "loss": 0.0965, "reward": 1.8135417461395265, "reward_std": 0.2634100914001465, "rewards/accuracy_reward": 0.12916666977107524, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7385416805744172, "step": 2963 }, { "clip_ratio": 0.0, "completion_length": 559.8333557128906, "epoch": 0.9486317810849736, "grad_norm": 0.10099179297685623, "kl": 0.282944992184639, "learning_rate": 1.5945218747530855e-07, "loss": 0.0808, "reward": 1.7708333730697632, "reward_std": 0.21695861518383025, "rewards/accuracy_reward": 0.0791666703298688, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7395833492279053, "step": 2964 }, { "clip_ratio": 0.0, "completion_length": 584.195849609375, "epoch": 0.9489518322931669, "grad_norm": 0.1370352953672409, "kl": 0.34712174348533154, "learning_rate": 1.5747049503248013e-07, "loss": 0.0634, "reward": 1.7697916984558106, "reward_std": 0.1845792807638645, "rewards/accuracy_reward": 0.05416666939854622, "rewards/format_reward": 0.9708333432674408, "rewards/tag_count_reward": 0.7447916746139527, "step": 2965 }, { "clip_ratio": 0.0, "completion_length": 560.9771179199219, "epoch": 0.9492718835013603, "grad_norm": 0.16078034043312073, "kl": 0.3685935214161873, "learning_rate": 1.5550109634975718e-07, "loss": 0.1126, "reward": 1.7260416984558105, "reward_std": 0.2447558268904686, "rewards/accuracy_reward": 0.04375000055879354, "rewards/format_reward": 0.9458333611488342, "rewards/tag_count_reward": 0.7364583492279053, "step": 2966 }, { "clip_ratio": 0.0, "completion_length": 552.1083526611328, "epoch": 0.9495919347095535, "grad_norm": 0.2678912878036499, "kl": 0.35141145139932634, "learning_rate": 1.5354399388700868e-07, "loss": 0.1208, "reward": 1.7854167222976685, "reward_std": 0.2304367497563362, "rewards/accuracy_reward": 0.10625000111758709, "rewards/format_reward": 0.9437500238418579, "rewards/tag_count_reward": 0.7354166865348816, "step": 2967 }, { "clip_ratio": 0.0, "completion_length": 551.5291870117187, "epoch": 0.9499119859177468, "grad_norm": 0.20223970711231232, "kl": 0.3309009000658989, "learning_rate": 1.5159919008874368e-07, "loss": 0.1449, "reward": 1.8500000476837157, "reward_std": 0.24914255663752555, "rewards/accuracy_reward": 0.166666672937572, "rewards/format_reward": 0.9479166805744171, "rewards/tag_count_reward": 0.7354166805744171, "step": 2968 }, { "clip_ratio": 0.0, "completion_length": 563.0000183105469, "epoch": 0.9502320371259402, "grad_norm": 0.10167260468006134, "kl": 0.17943341620266437, "learning_rate": 1.4966668738410905e-07, "loss": 0.0396, "reward": 1.8109375357627868, "reward_std": 0.13079179599881172, "rewards/accuracy_reward": 0.08541666883975267, "rewards/format_reward": 0.9770833373069763, "rewards/tag_count_reward": 0.7484375059604644, "step": 2969 }, { "clip_ratio": 0.0, "completion_length": 574.5729339599609, "epoch": 0.9505520883341335, "grad_norm": 0.11473032832145691, "kl": 0.2629383150488138, "learning_rate": 1.477464881868862e-07, "loss": 0.0789, "reward": 1.8244792222976685, "reward_std": 0.1891520008444786, "rewards/accuracy_reward": 0.12083333693444728, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7411458432674408, "step": 2970 }, { "clip_ratio": 0.0, "completion_length": 553.7437713623046, "epoch": 0.9508721395423267, "grad_norm": 0.12449677288532257, "kl": 0.2245940238237381, "learning_rate": 1.458385948954899e-07, "loss": 0.0666, "reward": 1.8380208969116212, "reward_std": 0.19873855113983155, "rewards/accuracy_reward": 0.13333333805203437, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7401041805744171, "step": 2971 }, { "clip_ratio": 0.0, "completion_length": 543.5583557128906, "epoch": 0.9511921907505201, "grad_norm": 0.29751840233802795, "kl": 0.4296580046415329, "learning_rate": 1.4394300989296618e-07, "loss": 0.1319, "reward": 1.7692708730697633, "reward_std": 0.23732186257839202, "rewards/accuracy_reward": 0.08750000204890966, "rewards/format_reward": 0.9395833432674408, "rewards/tag_count_reward": 0.7421875119209289, "step": 2972 }, { "clip_ratio": 0.0, "completion_length": 535.5562683105469, "epoch": 0.9515122419587134, "grad_norm": 0.11807762831449509, "kl": 0.23386535197496414, "learning_rate": 1.4205973554698548e-07, "loss": 0.0742, "reward": 1.7604166865348816, "reward_std": 0.2106850653886795, "rewards/accuracy_reward": 0.0541666679084301, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7416666865348815, "step": 2973 }, { "clip_ratio": 0.0, "completion_length": 580.064599609375, "epoch": 0.9518322931669067, "grad_norm": 0.14003440737724304, "kl": 0.21622378900647163, "learning_rate": 1.4018877420983956e-07, "loss": 0.0603, "reward": 1.7697917222976685, "reward_std": 0.18948814198374747, "rewards/accuracy_reward": 0.05625000167638063, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7427083432674408, "step": 2974 }, { "clip_ratio": 0.0, "completion_length": 538.5562561035156, "epoch": 0.9521523443751, "grad_norm": 0.10616900771856308, "kl": 0.22268827855587006, "learning_rate": 1.383301282184446e-07, "loss": 0.089, "reward": 1.7739583611488343, "reward_std": 0.19601852148771287, "rewards/accuracy_reward": 0.06250000316649676, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7447916805744171, "step": 2975 }, { "clip_ratio": 0.0, "completion_length": 565.8604309082032, "epoch": 0.9524723955832933, "grad_norm": 0.14727354049682617, "kl": 0.3326699022203684, "learning_rate": 1.3648379989433135e-07, "loss": 0.1121, "reward": 1.7796875476837157, "reward_std": 0.23153150603175163, "rewards/accuracy_reward": 0.09791666977107524, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.735937523841858, "step": 2976 }, { "clip_ratio": 0.0, "completion_length": 555.3062683105469, "epoch": 0.9527924467914867, "grad_norm": 0.1879003345966339, "kl": 0.24564929269254207, "learning_rate": 1.3464979154364844e-07, "loss": 0.074, "reward": 1.8286458730697632, "reward_std": 0.17841232642531396, "rewards/accuracy_reward": 0.12083333730697632, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7390625178813934, "step": 2977 }, { "clip_ratio": 0.0, "completion_length": 553.3708587646485, "epoch": 0.9531124979996799, "grad_norm": 0.24627186357975006, "kl": 0.2939072445034981, "learning_rate": 1.328281054571534e-07, "loss": 0.1061, "reward": 1.7781250715255736, "reward_std": 0.23098910599946976, "rewards/accuracy_reward": 0.08750000372529029, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7364583551883698, "step": 2978 }, { "clip_ratio": 0.0, "completion_length": 564.483349609375, "epoch": 0.9534325492078732, "grad_norm": 0.11084544658660889, "kl": 0.3061968058347702, "learning_rate": 1.3101874391021285e-07, "loss": 0.0848, "reward": 1.8708333849906922, "reward_std": 0.20706724524497985, "rewards/accuracy_reward": 0.17500000596046447, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7354166865348816, "step": 2979 }, { "clip_ratio": 0.0, "completion_length": 564.2666809082032, "epoch": 0.9537526004160666, "grad_norm": 0.16704536974430084, "kl": 0.42880779802799224, "learning_rate": 1.2922170916280118e-07, "loss": 0.0408, "reward": 1.8218750476837158, "reward_std": 0.13136814534664154, "rewards/accuracy_reward": 0.10625000316649676, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.7427083551883698, "step": 2980 }, { "clip_ratio": 0.0, "completion_length": 580.6833618164062, "epoch": 0.9540726516242599, "grad_norm": 0.18254122138023376, "kl": 0.3421017203480005, "learning_rate": 1.274370034594974e-07, "loss": 0.1099, "reward": 1.7604167222976685, "reward_std": 0.24414713978767394, "rewards/accuracy_reward": 0.0791666692122817, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.7375000178813934, "step": 2981 }, { "clip_ratio": 0.0, "completion_length": 561.4562683105469, "epoch": 0.9543927028324531, "grad_norm": 0.18417489528656006, "kl": 0.282155305147171, "learning_rate": 1.2566462902947496e-07, "loss": 0.0854, "reward": 1.817708396911621, "reward_std": 0.24376452192664147, "rewards/accuracy_reward": 0.13125000540167092, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.7406250298023224, "step": 2982 }, { "clip_ratio": 0.0, "completion_length": 574.7604431152344, "epoch": 0.9547127540406465, "grad_norm": 0.21982231736183167, "kl": 0.3524301677942276, "learning_rate": 1.2390458808651085e-07, "loss": 0.0805, "reward": 1.8322916984558106, "reward_std": 0.29580608904361727, "rewards/accuracy_reward": 0.14166667107492686, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7343750238418579, "step": 2983 }, { "clip_ratio": 0.0, "completion_length": 559.364599609375, "epoch": 0.9550328052488398, "grad_norm": 0.18464107811450958, "kl": 0.2895275134593248, "learning_rate": 1.2215688282897542e-07, "loss": 0.1011, "reward": 1.7953125715255738, "reward_std": 0.21910280585289002, "rewards/accuracy_reward": 0.1083333345130086, "rewards/format_reward": 0.9520833432674408, "rewards/tag_count_reward": 0.7348958492279053, "step": 2984 }, { "clip_ratio": 0.0, "completion_length": 564.8750152587891, "epoch": 0.9553528564570332, "grad_norm": 0.2103576511144638, "kl": 0.48140868097543715, "learning_rate": 1.2042151543983028e-07, "loss": 0.0708, "reward": 1.8223958730697631, "reward_std": 0.18225472569465637, "rewards/accuracy_reward": 0.11458333879709244, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.739062511920929, "step": 2985 }, { "clip_ratio": 0.0, "completion_length": 560.4375183105469, "epoch": 0.9556729076652264, "grad_norm": 0.10476110875606537, "kl": 0.3020617179572582, "learning_rate": 1.186984880866271e-07, "loss": 0.0947, "reward": 1.784375047683716, "reward_std": 0.20616189390420914, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7406250238418579, "step": 2986 }, { "clip_ratio": 0.0, "completion_length": 577.6562744140625, "epoch": 0.9559929588734197, "grad_norm": 0.1183815449476242, "kl": 0.30500880554318427, "learning_rate": 1.1698780292150325e-07, "loss": 0.0719, "reward": 1.864583384990692, "reward_std": 0.2061496764421463, "rewards/accuracy_reward": 0.15625000447034837, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7416666865348815, "step": 2987 }, { "clip_ratio": 0.0, "completion_length": 603.670849609375, "epoch": 0.9563130100816131, "grad_norm": 0.2120182067155838, "kl": 0.2620538957417011, "learning_rate": 1.1528946208118286e-07, "loss": 0.0741, "reward": 1.7244791984558105, "reward_std": 0.19255055412650107, "rewards/accuracy_reward": 0.02083333358168602, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7411458551883697, "step": 2988 }, { "clip_ratio": 0.0, "completion_length": 559.0458587646484, "epoch": 0.9566330612898064, "grad_norm": 0.18735171854496002, "kl": 0.3196378767490387, "learning_rate": 1.1360346768696907e-07, "loss": 0.0928, "reward": 1.7416666984558105, "reward_std": 0.24244018495082856, "rewards/accuracy_reward": 0.05833333563059569, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7375000298023224, "step": 2989 }, { "clip_ratio": 0.0, "completion_length": 570.8541931152344, "epoch": 0.9569531124979996, "grad_norm": 0.09267427027225494, "kl": 0.17590968012809755, "learning_rate": 1.11929821844744e-07, "loss": 0.0771, "reward": 1.7635416984558105, "reward_std": 0.16972372829914092, "rewards/accuracy_reward": 0.05000000111758709, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7427083492279053, "step": 2990 }, { "clip_ratio": 0.0, "completion_length": 547.9312530517578, "epoch": 0.957273163706193, "grad_norm": 0.12013649940490723, "kl": 0.2494128279387951, "learning_rate": 1.1026852664496656e-07, "loss": 0.0976, "reward": 1.784375047683716, "reward_std": 0.1636178210377693, "rewards/accuracy_reward": 0.0729166679084301, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7427083492279053, "step": 2991 }, { "clip_ratio": 0.0, "completion_length": 560.4687683105469, "epoch": 0.9575932149143863, "grad_norm": 0.14498887956142426, "kl": 0.18977786004543304, "learning_rate": 1.0861958416266805e-07, "loss": 0.0593, "reward": 1.8005208730697633, "reward_std": 0.19197710752487182, "rewards/accuracy_reward": 0.0854166679084301, "rewards/format_reward": 0.9687500238418579, "rewards/tag_count_reward": 0.7463541805744172, "step": 2992 }, { "clip_ratio": 0.0, "completion_length": 572.1187683105469, "epoch": 0.9579132661225797, "grad_norm": 0.09392816573381424, "kl": 0.33596227318048477, "learning_rate": 1.0698299645745203e-07, "loss": 0.0888, "reward": 1.7036458611488343, "reward_std": 0.21316526755690574, "rewards/accuracy_reward": 0.018750000186264514, "rewards/format_reward": 0.9479166805744171, "rewards/tag_count_reward": 0.7369791865348816, "step": 2993 }, { "clip_ratio": 0.0, "completion_length": 548.0562622070313, "epoch": 0.9582333173307729, "grad_norm": 0.11171252280473709, "kl": 0.22640425711870193, "learning_rate": 1.0535876557349111e-07, "loss": 0.0706, "reward": 1.7869792103767395, "reward_std": 0.18889242559671401, "rewards/accuracy_reward": 0.07500000298023224, "rewards/format_reward": 0.9729166924953461, "rewards/tag_count_reward": 0.7390625178813934, "step": 2994 }, { "clip_ratio": 0.0, "completion_length": 571.7166778564454, "epoch": 0.9585533685389662, "grad_norm": 0.10955782979726791, "kl": 0.22637251615524293, "learning_rate": 1.0374689353952027e-07, "loss": 0.057, "reward": 1.810937523841858, "reward_std": 0.16208795085549355, "rewards/accuracy_reward": 0.10000000204890966, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7401041865348816, "step": 2995 }, { "clip_ratio": 0.0, "completion_length": 542.2229400634766, "epoch": 0.9588734197471596, "grad_norm": 0.285765677690506, "kl": 0.27739047557115554, "learning_rate": 1.0214738236884014e-07, "loss": 0.0849, "reward": 1.8604167103767395, "reward_std": 0.19695503115653992, "rewards/accuracy_reward": 0.15625000316649676, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7395833551883697, "step": 2996 }, { "clip_ratio": 0.0, "completion_length": 585.2750183105469, "epoch": 0.9591934709553529, "grad_norm": 0.2077830582857132, "kl": 0.36373098865151404, "learning_rate": 1.0056023405931259e-07, "loss": 0.0933, "reward": 1.742187535762787, "reward_std": 0.2123005896806717, "rewards/accuracy_reward": 0.0520833358168602, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7359375178813934, "step": 2997 }, { "clip_ratio": 0.0, "completion_length": 555.8916839599609, "epoch": 0.9595135221635461, "grad_norm": 0.1561278998851776, "kl": 0.2940549574792385, "learning_rate": 9.898545059335852e-08, "loss": 0.0842, "reward": 1.717187523841858, "reward_std": 0.1930335447192192, "rewards/accuracy_reward": 0.022916667349636554, "rewards/format_reward": 0.956250011920929, "rewards/tag_count_reward": 0.7380208492279052, "step": 2998 }, { "clip_ratio": 0.0, "completion_length": 572.1375213623047, "epoch": 0.9598335733717395, "grad_norm": 0.10308010131120682, "kl": 0.22390391640365123, "learning_rate": 9.742303393795005e-08, "loss": 0.0928, "reward": 1.8218750476837158, "reward_std": 0.19930132627487182, "rewards/accuracy_reward": 0.12500000409781933, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7364583492279053, "step": 2999 }, { "clip_ratio": 0.0, "completion_length": 591.5979370117187, "epoch": 0.9601536245799328, "grad_norm": 0.16804414987564087, "kl": 0.21638592407107354, "learning_rate": 9.587298604461614e-08, "loss": 0.0584, "reward": 1.7817708849906921, "reward_std": 0.1360452577471733, "rewards/accuracy_reward": 0.07291666865348816, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7463541865348816, "step": 3000 }, { "clip_ratio": 0.0, "completion_length": 543.927099609375, "epoch": 0.9604736757881261, "grad_norm": 0.09598581492900848, "kl": 0.1559834960848093, "learning_rate": 9.433530884943698e-08, "loss": 0.0599, "reward": 1.767708384990692, "reward_std": 0.12271808385848999, "rewards/accuracy_reward": 0.04583333469927311, "rewards/format_reward": 0.9770833432674408, "rewards/tag_count_reward": 0.7447916865348816, "step": 3001 }, { "clip_ratio": 0.0, "completion_length": 547.9541900634765, "epoch": 0.9607937269963194, "grad_norm": 0.12407588958740234, "kl": 0.360856419801712, "learning_rate": 9.281000427304066e-08, "loss": 0.069, "reward": 1.8088541984558106, "reward_std": 0.20785682201385497, "rewards/accuracy_reward": 0.10833333637565375, "rewards/format_reward": 0.9583333611488343, "rewards/tag_count_reward": 0.7421875298023224, "step": 3002 }, { "clip_ratio": 0.0, "completion_length": 572.9437713623047, "epoch": 0.9611137782045127, "grad_norm": 0.12733730673789978, "kl": 0.2946822591125965, "learning_rate": 9.129707422059986e-08, "loss": 0.0782, "reward": 1.7244791984558105, "reward_std": 0.16163463965058328, "rewards/accuracy_reward": 0.018750000558793545, "rewards/format_reward": 0.9625000119209289, "rewards/tag_count_reward": 0.7432291865348816, "step": 3003 }, { "clip_ratio": 0.0, "completion_length": 537.0750152587891, "epoch": 0.961433829412706, "grad_norm": 0.3727104365825653, "kl": 0.2450065303593874, "learning_rate": 8.979652058183185e-08, "loss": 0.1028, "reward": 1.8244792222976685, "reward_std": 0.22431774139404298, "rewards/accuracy_reward": 0.11875000149011612, "rewards/format_reward": 0.9645833432674408, "rewards/tag_count_reward": 0.7411458551883697, "step": 3004 }, { "clip_ratio": 0.0, "completion_length": 570.4645965576171, "epoch": 0.9617538806208994, "grad_norm": 0.12149068713188171, "kl": 0.20398061200976372, "learning_rate": 8.830834523099518e-08, "loss": 0.0679, "reward": 1.773437535762787, "reward_std": 0.1679071843624115, "rewards/accuracy_reward": 0.0645833358168602, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7442708492279053, "step": 3005 }, { "clip_ratio": 0.0, "completion_length": 581.0854370117188, "epoch": 0.9620739318290926, "grad_norm": 0.1221243143081665, "kl": 0.39972667805850504, "learning_rate": 8.683255002688962e-08, "loss": 0.113, "reward": 1.7072916865348815, "reward_std": 0.22407453507184982, "rewards/accuracy_reward": 0.01666666716337204, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7385416865348816, "step": 3006 }, { "clip_ratio": 0.0, "completion_length": 521.8416870117187, "epoch": 0.962393983037286, "grad_norm": 0.12119229882955551, "kl": 0.21771190762519838, "learning_rate": 8.536913681284731e-08, "loss": 0.0849, "reward": 1.8875000596046447, "reward_std": 0.2431383326649666, "rewards/accuracy_reward": 0.18958333935588598, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7395833551883697, "step": 3007 }, { "clip_ratio": 0.0, "completion_length": 537.3979278564453, "epoch": 0.9627140342454793, "grad_norm": 0.36122357845306396, "kl": 0.2558132287114859, "learning_rate": 8.391810741673722e-08, "loss": 0.0811, "reward": 1.783333384990692, "reward_std": 0.20741451680660247, "rewards/accuracy_reward": 0.07708333544433117, "rewards/format_reward": 0.9645833432674408, "rewards/tag_count_reward": 0.7416666865348815, "step": 3008 }, { "clip_ratio": 0.0, "completion_length": 519.7666809082032, "epoch": 0.9630340854536726, "grad_norm": 0.22902488708496094, "kl": 0.29923873767256737, "learning_rate": 8.24794636509596e-08, "loss": 0.0986, "reward": 1.8473958611488341, "reward_std": 0.23173879757523536, "rewards/accuracy_reward": 0.1416666716337204, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7411458551883697, "step": 3009 }, { "clip_ratio": 0.0, "completion_length": 570.310433959961, "epoch": 0.9633541366618659, "grad_norm": 0.22059831023216248, "kl": 0.2688716005533934, "learning_rate": 8.105320731244703e-08, "loss": 0.0727, "reward": 1.8421875715255738, "reward_std": 0.18312807828187944, "rewards/accuracy_reward": 0.12708333563059568, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7442708492279053, "step": 3010 }, { "clip_ratio": 0.0, "completion_length": 570.7916961669922, "epoch": 0.9636741878700592, "grad_norm": 0.36360955238342285, "kl": 0.4199592448771, "learning_rate": 7.963934018265562e-08, "loss": 0.0867, "reward": 1.7635416984558105, "reward_std": 0.23115942478179932, "rewards/accuracy_reward": 0.0916666692122817, "rewards/format_reward": 0.9375000119209289, "rewards/tag_count_reward": 0.7343750059604645, "step": 3011 }, { "clip_ratio": 0.0, "completion_length": 596.1250183105469, "epoch": 0.9639942390782525, "grad_norm": 0.22022384405136108, "kl": 0.3199925169348717, "learning_rate": 7.823786402756827e-08, "loss": 0.1118, "reward": 1.7937500715255736, "reward_std": 0.23845134377479554, "rewards/accuracy_reward": 0.10416666995733977, "rewards/format_reward": 0.9479166984558105, "rewards/tag_count_reward": 0.7416666924953461, "step": 3012 }, { "clip_ratio": 0.0, "completion_length": 589.6333435058593, "epoch": 0.9643142902864459, "grad_norm": 0.15807892382144928, "kl": 0.33093359544873235, "learning_rate": 7.684878059769363e-08, "loss": 0.102, "reward": 1.794270884990692, "reward_std": 0.2638327829539776, "rewards/accuracy_reward": 0.11875000111758709, "rewards/format_reward": 0.9395833611488342, "rewards/tag_count_reward": 0.735937523841858, "step": 3013 }, { "clip_ratio": 0.0, "completion_length": 569.2354339599609, "epoch": 0.9646343414946391, "grad_norm": 0.13306574523448944, "kl": 0.27508914321660993, "learning_rate": 7.547209162805824e-08, "loss": 0.0831, "reward": 1.820833396911621, "reward_std": 0.2090136304497719, "rewards/accuracy_reward": 0.11458333749324083, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7416666865348815, "step": 3014 }, { "clip_ratio": 0.0, "completion_length": 549.6041900634766, "epoch": 0.9649543927028325, "grad_norm": 0.10513435304164886, "kl": 0.2855644281953573, "learning_rate": 7.410779883820663e-08, "loss": 0.0818, "reward": 1.765625035762787, "reward_std": 0.17442589700222016, "rewards/accuracy_reward": 0.05625000204890966, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.740625011920929, "step": 3015 }, { "clip_ratio": 0.0, "completion_length": 555.6416870117188, "epoch": 0.9652744439110258, "grad_norm": 0.11084417253732681, "kl": 0.26390017867088317, "learning_rate": 7.275590393220456e-08, "loss": 0.0724, "reward": 1.7838541865348816, "reward_std": 0.19236919954419135, "rewards/accuracy_reward": 0.08125000242143869, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7421875178813935, "step": 3016 }, { "clip_ratio": 0.0, "completion_length": 533.6958526611328, "epoch": 0.965594495119219, "grad_norm": 0.24113225936889648, "kl": 0.3328822206705809, "learning_rate": 7.141640859862576e-08, "loss": 0.0785, "reward": 1.7536458730697633, "reward_std": 0.1738430380821228, "rewards/accuracy_reward": 0.04375000055879354, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7390625178813934, "step": 3017 }, { "clip_ratio": 0.0, "completion_length": 551.5562683105469, "epoch": 0.9659145463274124, "grad_norm": 0.21408711373806, "kl": 0.43628372699022294, "learning_rate": 7.0089314510563e-08, "loss": 0.1153, "reward": 1.7932292103767395, "reward_std": 0.22712324783205987, "rewards/accuracy_reward": 0.1000000037252903, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7369791805744171, "step": 3018 }, { "clip_ratio": 0.0, "completion_length": 591.3750183105469, "epoch": 0.9662345975356057, "grad_norm": 0.247038334608078, "kl": 0.26444780342280866, "learning_rate": 6.877462332561479e-08, "loss": 0.0754, "reward": 1.7697916865348815, "reward_std": 0.18048504441976548, "rewards/accuracy_reward": 0.07291666772216558, "rewards/format_reward": 0.9541666746139527, "rewards/tag_count_reward": 0.7427083432674408, "step": 3019 }, { "clip_ratio": 0.0, "completion_length": 551.7625244140625, "epoch": 0.966554648743799, "grad_norm": 0.10298003256320953, "kl": 0.23233777694404126, "learning_rate": 6.747233668588981e-08, "loss": 0.0442, "reward": 1.8005208611488341, "reward_std": 0.13236782178282738, "rewards/accuracy_reward": 0.07916666977107525, "rewards/format_reward": 0.9791666805744171, "rewards/tag_count_reward": 0.7421875119209289, "step": 3020 }, { "clip_ratio": 0.0, "completion_length": 552.0062652587891, "epoch": 0.9668746999519923, "grad_norm": 0.1813294142484665, "kl": 0.2319230657070875, "learning_rate": 6.618245621800135e-08, "loss": 0.0954, "reward": 1.8520833849906921, "reward_std": 0.2161620169878006, "rewards/accuracy_reward": 0.1458333360031247, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7395833611488343, "step": 3021 }, { "clip_ratio": 0.0, "completion_length": 578.5916809082031, "epoch": 0.9671947511601856, "grad_norm": 0.1469898223876953, "kl": 0.2930970214307308, "learning_rate": 6.49049835330684e-08, "loss": 0.1043, "reward": 1.709375023841858, "reward_std": 0.20650226771831512, "rewards/accuracy_reward": 0.020833334326744078, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7364583492279053, "step": 3022 }, { "clip_ratio": 0.0, "completion_length": 587.5625183105469, "epoch": 0.967514802368379, "grad_norm": 0.1649038940668106, "kl": 0.3373178992420435, "learning_rate": 6.36399202267135e-08, "loss": 0.0835, "reward": 1.7692708849906922, "reward_std": 0.20760041624307632, "rewards/accuracy_reward": 0.07291666809469462, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7359375178813934, "step": 3023 }, { "clip_ratio": 0.0, "completion_length": 547.9000244140625, "epoch": 0.9678348535765723, "grad_norm": 0.1377795785665512, "kl": 0.3631039060652256, "learning_rate": 6.23872678790538e-08, "loss": 0.0643, "reward": 1.7510417103767395, "reward_std": 0.21666239500045775, "rewards/accuracy_reward": 0.05625000260770321, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7385416805744172, "step": 3024 }, { "clip_ratio": 0.0, "completion_length": 572.9229370117188, "epoch": 0.9681549047847655, "grad_norm": 0.2074446827173233, "kl": 0.3897064059972763, "learning_rate": 6.114702805471107e-08, "loss": 0.1027, "reward": 1.7135416865348816, "reward_std": 0.22525653690099717, "rewards/accuracy_reward": 0.02291666716337204, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7385416865348816, "step": 3025 }, { "clip_ratio": 0.0, "completion_length": 570.4541809082032, "epoch": 0.9684749559929589, "grad_norm": 0.10851828008890152, "kl": 0.273499009013176, "learning_rate": 5.991920230279946e-08, "loss": 0.0826, "reward": 1.7744792103767395, "reward_std": 0.18417903929948806, "rewards/accuracy_reward": 0.08125000298023224, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7390625238418579, "step": 3026 }, { "clip_ratio": 0.0, "completion_length": 559.989599609375, "epoch": 0.9687950072011522, "grad_norm": 0.0974067747592926, "kl": 0.3477734237909317, "learning_rate": 5.870379215692778e-08, "loss": 0.1281, "reward": 1.8484375596046447, "reward_std": 0.24913154244422914, "rewards/accuracy_reward": 0.15833333786576986, "rewards/format_reward": 0.9520833611488342, "rewards/tag_count_reward": 0.7380208551883698, "step": 3027 }, { "clip_ratio": 0.0, "completion_length": 549.3396026611329, "epoch": 0.9691150584093455, "grad_norm": 0.10124637931585312, "kl": 0.31874447837471964, "learning_rate": 5.750079913519835e-08, "loss": 0.1191, "reward": 1.804687535762787, "reward_std": 0.23008078709244728, "rewards/accuracy_reward": 0.11250000204890967, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7359375178813934, "step": 3028 }, { "clip_ratio": 0.0, "completion_length": 585.7896057128906, "epoch": 0.9694351096175388, "grad_norm": 0.32607904076576233, "kl": 0.19704431369900705, "learning_rate": 5.6310224740202536e-08, "loss": 0.0759, "reward": 1.7203125476837158, "reward_std": 0.17980255410075188, "rewards/accuracy_reward": 0.01458333395421505, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7411458432674408, "step": 3029 }, { "clip_ratio": 0.0, "completion_length": 566.6291809082031, "epoch": 0.9697551608257321, "grad_norm": 0.13909195363521576, "kl": 0.36173166893422604, "learning_rate": 5.5132070459021914e-08, "loss": 0.0725, "reward": 1.7401042103767395, "reward_std": 0.21508442014455795, "rewards/accuracy_reward": 0.0479166679084301, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7401041805744171, "step": 3030 }, { "clip_ratio": 0.0, "completion_length": 583.108349609375, "epoch": 0.9700752120339254, "grad_norm": 0.18055284023284912, "kl": 0.40517835319042206, "learning_rate": 5.3966337763223795e-08, "loss": 0.1077, "reward": 1.7041666746139525, "reward_std": 0.1730414643883705, "rewards/accuracy_reward": 0.002083333395421505, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7416666865348815, "step": 3031 }, { "clip_ratio": 0.0, "completion_length": 563.9541809082032, "epoch": 0.9703952632421188, "grad_norm": 0.1053120568394661, "kl": 0.226448442786932, "learning_rate": 5.281302810886013e-08, "loss": 0.0701, "reward": 1.7687500357627868, "reward_std": 0.2200807049870491, "rewards/accuracy_reward": 0.07291666939854621, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7354166865348816, "step": 3032 }, { "clip_ratio": 0.0, "completion_length": 579.9541809082032, "epoch": 0.970715314450312, "grad_norm": 0.14212128520011902, "kl": 0.21750407926738263, "learning_rate": 5.1672142936466385e-08, "loss": 0.0557, "reward": 1.7453125238418579, "reward_std": 0.20142756253480912, "rewards/accuracy_reward": 0.0375, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7411458551883697, "step": 3033 }, { "clip_ratio": 0.0, "completion_length": 564.0041931152343, "epoch": 0.9710353656585053, "grad_norm": 0.12605281174182892, "kl": 0.26052655279636383, "learning_rate": 5.054368367106044e-08, "loss": 0.0366, "reward": 1.8239583730697633, "reward_std": 0.18405086249113084, "rewards/accuracy_reward": 0.11250000298023224, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7447916865348816, "step": 3034 }, { "clip_ratio": 0.0, "completion_length": 571.1479339599609, "epoch": 0.9713554168666987, "grad_norm": 0.09858619421720505, "kl": 0.29451265931129456, "learning_rate": 4.9427651722137035e-08, "loss": 0.0872, "reward": 1.8848958730697631, "reward_std": 0.2750785931944847, "rewards/accuracy_reward": 0.1916666753590107, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7369791865348816, "step": 3035 }, { "clip_ratio": 0.0, "completion_length": 575.910433959961, "epoch": 0.971675468074892, "grad_norm": 0.2123694270849228, "kl": 0.23448918834328653, "learning_rate": 4.8324048483670006e-08, "loss": 0.0798, "reward": 1.8343750476837157, "reward_std": 0.2175012208521366, "rewards/accuracy_reward": 0.13541667275130748, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7385416805744172, "step": 3036 }, { "clip_ratio": 0.0, "completion_length": 527.3396026611329, "epoch": 0.9719955192830853, "grad_norm": 0.0967680960893631, "kl": 0.19490599371492862, "learning_rate": 4.723287533411003e-08, "loss": 0.07, "reward": 1.7619791865348815, "reward_std": 0.22343166172504425, "rewards/accuracy_reward": 0.054166667722165586, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7432291865348816, "step": 3037 }, { "clip_ratio": 0.0, "completion_length": 571.5083618164062, "epoch": 0.9723155704912786, "grad_norm": 0.15740668773651123, "kl": 0.2883405897766352, "learning_rate": 4.615413363638133e-08, "loss": 0.0588, "reward": 1.7901041984558106, "reward_std": 0.24050376191735268, "rewards/accuracy_reward": 0.08125000316649675, "rewards/format_reward": 0.9708333551883698, "rewards/tag_count_reward": 0.7380208551883698, "step": 3038 }, { "clip_ratio": 0.0, "completion_length": 601.1812683105469, "epoch": 0.9726356216994719, "grad_norm": 0.17502444982528687, "kl": 0.32404273599386213, "learning_rate": 4.508782473787943e-08, "loss": 0.1306, "reward": 1.7463541865348815, "reward_std": 0.2457516685128212, "rewards/accuracy_reward": 0.06250000130385161, "rewards/format_reward": 0.9437500238418579, "rewards/tag_count_reward": 0.7401041805744171, "step": 3039 }, { "clip_ratio": 0.0, "completion_length": 569.497933959961, "epoch": 0.9729556729076653, "grad_norm": 0.09075573831796646, "kl": 0.2926942154765129, "learning_rate": 4.403394997047339e-08, "loss": 0.1007, "reward": 1.8265625476837157, "reward_std": 0.27125475853681563, "rewards/accuracy_reward": 0.13958333991467953, "rewards/format_reward": 0.947916692495346, "rewards/tag_count_reward": 0.7390625238418579, "step": 3040 }, { "clip_ratio": 0.0, "completion_length": 576.8562744140625, "epoch": 0.9732757241158585, "grad_norm": 0.15645340085029602, "kl": 0.46642662212252617, "learning_rate": 4.299251065049803e-08, "loss": 0.136, "reward": 1.7083333730697632, "reward_std": 0.28290791213512423, "rewards/accuracy_reward": 0.04375000055879354, "rewards/format_reward": 0.9354166805744171, "rewards/tag_count_reward": 0.7291666746139527, "step": 3041 }, { "clip_ratio": 0.0, "completion_length": 585.3916931152344, "epoch": 0.9735957753240518, "grad_norm": 0.18267188966274261, "kl": 0.4055576235055923, "learning_rate": 4.1963508078759486e-08, "loss": 0.1059, "reward": 1.7119792103767395, "reward_std": 0.26228521317243575, "rewards/accuracy_reward": 0.058333334513008595, "rewards/format_reward": 0.9250000178813934, "rewards/tag_count_reward": 0.7286458551883698, "step": 3042 }, { "clip_ratio": 0.0, "completion_length": 559.5208618164063, "epoch": 0.9739158265322452, "grad_norm": 0.11391862481832504, "kl": 0.2481472548097372, "learning_rate": 4.094694354052742e-08, "loss": 0.0629, "reward": 1.8135417222976684, "reward_std": 0.14478981345891953, "rewards/accuracy_reward": 0.09583333693444729, "rewards/format_reward": 0.9750000059604644, "rewards/tag_count_reward": 0.7427083492279053, "step": 3043 }, { "clip_ratio": 0.0, "completion_length": 586.6312683105468, "epoch": 0.9742358777404385, "grad_norm": 0.14298422634601593, "kl": 0.3634337313473225, "learning_rate": 3.9942818305537255e-08, "loss": 0.1113, "reward": 1.7713541984558105, "reward_std": 0.23919814825057983, "rewards/accuracy_reward": 0.08541666865348815, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.735937523841858, "step": 3044 }, { "clip_ratio": 0.0, "completion_length": 571.8083557128906, "epoch": 0.9745559289486317, "grad_norm": 0.14067691564559937, "kl": 0.6347122829407453, "learning_rate": 3.895113362798464e-08, "loss": 0.1434, "reward": 1.7317708730697632, "reward_std": 0.28452501222491267, "rewards/accuracy_reward": 0.07083333451300859, "rewards/format_reward": 0.9354166805744171, "rewards/tag_count_reward": 0.7255208492279053, "step": 3045 }, { "clip_ratio": 0.0, "completion_length": 544.6729400634765, "epoch": 0.9748759801568251, "grad_norm": 0.2969621419906616, "kl": 0.24330624416470528, "learning_rate": 3.797189074652874e-08, "loss": 0.102, "reward": 1.7552083492279054, "reward_std": 0.192726993560791, "rewards/accuracy_reward": 0.060416669212281705, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7406250178813935, "step": 3046 }, { "clip_ratio": 0.0, "completion_length": 545.783349609375, "epoch": 0.9751960313650184, "grad_norm": 0.18543684482574463, "kl": 0.343426763266325, "learning_rate": 3.700509088428894e-08, "loss": 0.1255, "reward": 1.829687523841858, "reward_std": 0.23542412966489792, "rewards/accuracy_reward": 0.1416666742414236, "rewards/format_reward": 0.9500000178813934, "rewards/tag_count_reward": 0.7380208492279052, "step": 3047 }, { "clip_ratio": 0.0, "completion_length": 570.0229309082031, "epoch": 0.9755160825732118, "grad_norm": 0.2756257951259613, "kl": 0.3182968482375145, "learning_rate": 3.6050735248841506e-08, "loss": 0.0898, "reward": 1.7703125596046447, "reward_std": 0.24028173089027405, "rewards/accuracy_reward": 0.0791666692122817, "rewards/format_reward": 0.9562500178813934, "rewards/tag_count_reward": 0.7348958492279053, "step": 3048 }, { "clip_ratio": 0.0, "completion_length": 560.6937744140625, "epoch": 0.975836133781405, "grad_norm": 0.19421151280403137, "kl": 0.23446202799677848, "learning_rate": 3.5108825032217355e-08, "loss": 0.0748, "reward": 1.8364583611488343, "reward_std": 0.2093771666288376, "rewards/accuracy_reward": 0.12916667070239782, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7427083611488342, "step": 3049 }, { "clip_ratio": 0.0, "completion_length": 546.8229278564453, "epoch": 0.9761561849895983, "grad_norm": 0.1765829175710678, "kl": 0.239769284427166, "learning_rate": 3.417936141090539e-08, "loss": 0.0545, "reward": 1.792187547683716, "reward_std": 0.21487408950924874, "rewards/accuracy_reward": 0.08541667070239782, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7401041746139526, "step": 3050 }, { "clip_ratio": 0.0, "completion_length": 560.0208557128906, "epoch": 0.9764762361977917, "grad_norm": 0.13331003487110138, "kl": 0.2412990540266037, "learning_rate": 3.326234554584917e-08, "loss": 0.1001, "reward": 1.7531250238418579, "reward_std": 0.21676268726587294, "rewards/accuracy_reward": 0.05416666828095913, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7385416865348816, "step": 3051 }, { "clip_ratio": 0.0, "completion_length": 538.6000183105468, "epoch": 0.976796287405985, "grad_norm": 0.11980487406253815, "kl": 0.3370642215013504, "learning_rate": 3.235777858244027e-08, "loss": 0.0972, "reward": 1.8760417342185973, "reward_std": 0.20026633143424988, "rewards/accuracy_reward": 0.177083339355886, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7385416865348816, "step": 3052 }, { "clip_ratio": 0.0, "completion_length": 569.733349609375, "epoch": 0.9771163386141782, "grad_norm": 0.18214666843414307, "kl": 0.2942465879023075, "learning_rate": 3.1465661650523785e-08, "loss": 0.1002, "reward": 1.8046875596046448, "reward_std": 0.24582924097776412, "rewards/accuracy_reward": 0.12083333544433117, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7359375119209289, "step": 3053 }, { "clip_ratio": 0.0, "completion_length": 564.4770965576172, "epoch": 0.9774363898223716, "grad_norm": 0.2062879353761673, "kl": 0.21511095613241196, "learning_rate": 3.0585995864395033e-08, "loss": 0.0901, "reward": 1.7583333730697632, "reward_std": 0.20363759249448776, "rewards/accuracy_reward": 0.054166667722165586, "rewards/format_reward": 0.9583333492279053, "rewards/tag_count_reward": 0.7458333551883698, "step": 3054 }, { "clip_ratio": 0.0, "completion_length": 575.6312683105468, "epoch": 0.9777564410305649, "grad_norm": 0.25694888830184937, "kl": 0.2298228584229946, "learning_rate": 2.9718782322794015e-08, "loss": 0.0708, "reward": 1.7968750476837159, "reward_std": 0.24236893951892852, "rewards/accuracy_reward": 0.09583333730697632, "rewards/format_reward": 0.9604166924953461, "rewards/tag_count_reward": 0.7406250178813935, "step": 3055 }, { "clip_ratio": 0.0, "completion_length": 556.1375183105469, "epoch": 0.9780764922387583, "grad_norm": 0.06718038767576218, "kl": 0.20514454543590546, "learning_rate": 2.8864022108910927e-08, "loss": 0.0614, "reward": 1.7260416865348815, "reward_std": 0.13063133358955384, "rewards/accuracy_reward": 0.010416667163372039, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7447916865348816, "step": 3056 }, { "clip_ratio": 0.0, "completion_length": 556.6791870117188, "epoch": 0.9783965434469515, "grad_norm": 0.13397450745105743, "kl": 0.20096199810504914, "learning_rate": 2.802171629037953e-08, "loss": 0.0603, "reward": 1.8041666984558105, "reward_std": 0.21762454360723496, "rewards/accuracy_reward": 0.1125000037252903, "rewards/format_reward": 0.9520833492279053, "rewards/tag_count_reward": 0.7395833492279053, "step": 3057 }, { "clip_ratio": 0.0, "completion_length": 579.3729400634766, "epoch": 0.9787165946551448, "grad_norm": 0.17857791483402252, "kl": 0.3088830351829529, "learning_rate": 2.719186591927603e-08, "loss": 0.1031, "reward": 1.7270833730697632, "reward_std": 0.23266912549734114, "rewards/accuracy_reward": 0.03750000111758709, "rewards/format_reward": 0.950000011920929, "rewards/tag_count_reward": 0.7395833432674408, "step": 3058 }, { "clip_ratio": 0.0, "completion_length": 593.3021087646484, "epoch": 0.9790366458633382, "grad_norm": 0.15610116720199585, "kl": 0.23165589943528175, "learning_rate": 2.637447203212129e-08, "loss": 0.0853, "reward": 1.8005208849906922, "reward_std": 0.22202980518341064, "rewards/accuracy_reward": 0.0979166692122817, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7401041805744171, "step": 3059 }, { "clip_ratio": 0.0, "completion_length": 581.0396087646484, "epoch": 0.9793566970715314, "grad_norm": 0.24467244744300842, "kl": 0.2348164737224579, "learning_rate": 2.556953564987752e-08, "loss": 0.0722, "reward": 1.8072917342185975, "reward_std": 0.18154692202806472, "rewards/accuracy_reward": 0.09583333637565375, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.744791692495346, "step": 3060 }, { "clip_ratio": 0.0, "completion_length": 590.1250183105469, "epoch": 0.9796767482797247, "grad_norm": 0.14823976159095764, "kl": 0.28711467459797857, "learning_rate": 2.4777057777946034e-08, "loss": 0.1042, "reward": 1.6937500357627868, "reward_std": 0.2312575563788414, "rewards/accuracy_reward": 0.008333333395421505, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7354166984558106, "step": 3061 }, { "clip_ratio": 0.0, "completion_length": 566.2458557128906, "epoch": 0.9799967994879181, "grad_norm": 0.13439303636550903, "kl": 0.21451763063669205, "learning_rate": 2.3997039406167266e-08, "loss": 0.0484, "reward": 1.7958333849906922, "reward_std": 0.13553692996501923, "rewards/accuracy_reward": 0.0791666692122817, "rewards/format_reward": 0.9708333492279053, "rewards/tag_count_reward": 0.7458333492279052, "step": 3062 }, { "clip_ratio": 0.0, "completion_length": 595.1854309082031, "epoch": 0.9803168506961114, "grad_norm": 0.12152549624443054, "kl": 0.36722201108932495, "learning_rate": 2.322948150881854e-08, "loss": 0.1193, "reward": 1.707812535762787, "reward_std": 0.26899106055498123, "rewards/accuracy_reward": 0.052083334513008596, "rewards/format_reward": 0.9250000238418579, "rewards/tag_count_reward": 0.7307291865348816, "step": 3063 }, { "clip_ratio": 0.0, "completion_length": 553.4958404541015, "epoch": 0.9806369019043046, "grad_norm": 0.1406739056110382, "kl": 0.2821358598768711, "learning_rate": 2.2474385044615188e-08, "loss": 0.0637, "reward": 1.8052083611488343, "reward_std": 0.23521801978349685, "rewards/accuracy_reward": 0.10416666883975267, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7385416805744172, "step": 3064 }, { "clip_ratio": 0.0, "completion_length": 565.8479370117187, "epoch": 0.980956953112498, "grad_norm": 0.15969295799732208, "kl": 0.3610251784324646, "learning_rate": 2.173175095670499e-08, "loss": 0.122, "reward": 1.7802083611488342, "reward_std": 0.2173793375492096, "rewards/accuracy_reward": 0.08958333563059569, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.7427083551883698, "step": 3065 }, { "clip_ratio": 0.0, "completion_length": 585.2812683105469, "epoch": 0.9812770043206913, "grad_norm": 0.21414420008659363, "kl": 0.3591214381158352, "learning_rate": 2.100158017267151e-08, "loss": 0.0997, "reward": 1.7401041984558105, "reward_std": 0.2544398784637451, "rewards/accuracy_reward": 0.06041666846722364, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.7380208551883698, "step": 3066 }, { "clip_ratio": 0.0, "completion_length": 571.1646057128906, "epoch": 0.9815970555288847, "grad_norm": 0.19221711158752441, "kl": 0.2868764579296112, "learning_rate": 2.028387360453188e-08, "loss": 0.0974, "reward": 1.7140625238418579, "reward_std": 0.22124834954738617, "rewards/accuracy_reward": 0.02916666679084301, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7390625178813934, "step": 3067 }, { "clip_ratio": 0.0, "completion_length": 593.27294921875, "epoch": 0.9819171067370779, "grad_norm": 0.1753731369972229, "kl": 0.30784842520952227, "learning_rate": 1.9578632148733455e-08, "loss": 0.1143, "reward": 1.6729167103767395, "reward_std": 0.24664187729358672, "rewards/accuracy_reward": 0.006250000186264515, "rewards/format_reward": 0.9333333551883698, "rewards/tag_count_reward": 0.7333333551883697, "step": 3068 }, { "clip_ratio": 0.0, "completion_length": 563.4666870117187, "epoch": 0.9822371579452712, "grad_norm": 0.24415169656276703, "kl": 0.34099898114800453, "learning_rate": 1.8885856686152725e-08, "loss": 0.0821, "reward": 1.726562535762787, "reward_std": 0.21097205057740212, "rewards/accuracy_reward": 0.04583333432674408, "rewards/format_reward": 0.9479166865348816, "rewards/tag_count_reward": 0.732812511920929, "step": 3069 }, { "clip_ratio": 0.0, "completion_length": 533.0958435058594, "epoch": 0.9825572091534646, "grad_norm": 0.09442782402038574, "kl": 0.325906627625227, "learning_rate": 1.8205548082099733e-08, "loss": 0.0564, "reward": 1.7869791984558105, "reward_std": 0.1821589708328247, "rewards/accuracy_reward": 0.0770833358168602, "rewards/format_reward": 0.9687500119209289, "rewards/tag_count_reward": 0.7411458492279053, "step": 3070 }, { "clip_ratio": 0.0, "completion_length": 561.789599609375, "epoch": 0.9828772603616579, "grad_norm": 0.253113329410553, "kl": 0.24712217450141907, "learning_rate": 1.7537707186308093e-08, "loss": 0.1244, "reward": 1.801562535762787, "reward_std": 0.2580643087625504, "rewards/accuracy_reward": 0.12291667275130749, "rewards/format_reward": 0.9458333551883698, "rewards/tag_count_reward": 0.732812511920929, "step": 3071 }, { "clip_ratio": 0.0, "completion_length": 591.6104309082032, "epoch": 0.9831973115698511, "grad_norm": 0.1100858673453331, "kl": 0.22945720814168452, "learning_rate": 1.6882334832942772e-08, "loss": 0.0992, "reward": 1.8088541984558106, "reward_std": 0.20694586411118507, "rewards/accuracy_reward": 0.10833333563059569, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7442708551883698, "step": 3072 }, { "clip_ratio": 0.0, "completion_length": 569.0666870117187, "epoch": 0.9835173627780445, "grad_norm": 0.12897634506225586, "kl": 0.30970082357525824, "learning_rate": 1.623943184059229e-08, "loss": 0.1166, "reward": 1.7489583849906922, "reward_std": 0.21749227941036225, "rewards/accuracy_reward": 0.0687500013038516, "rewards/format_reward": 0.9416666865348816, "rewards/tag_count_reward": 0.7385416805744172, "step": 3073 }, { "clip_ratio": 0.0, "completion_length": 563.2000183105469, "epoch": 0.9838374139862378, "grad_norm": 0.24033485352993011, "kl": 0.3132885962724686, "learning_rate": 1.5608999012272085e-08, "loss": 0.0843, "reward": 1.7281250357627869, "reward_std": 0.16667985767126084, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7343750178813935, "step": 3074 }, { "clip_ratio": 0.0, "completion_length": 528.7500152587891, "epoch": 0.9841574651944311, "grad_norm": 0.07913769036531448, "kl": 0.2783251881599426, "learning_rate": 1.499103713542005e-08, "loss": 0.0482, "reward": 1.7885416746139526, "reward_std": 0.1665610209107399, "rewards/accuracy_reward": 0.07291666921228171, "rewards/format_reward": 0.9750000059604644, "rewards/tag_count_reward": 0.740625011920929, "step": 3075 }, { "clip_ratio": 0.0, "completion_length": 565.0833557128906, "epoch": 0.9844775164026244, "grad_norm": 0.17048057913780212, "kl": 0.34632964730262755, "learning_rate": 1.4385546981897647e-08, "loss": 0.1022, "reward": 1.7401042103767395, "reward_std": 0.21267496347427367, "rewards/accuracy_reward": 0.045833334885537626, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7380208551883698, "step": 3076 }, { "clip_ratio": 0.0, "completion_length": 588.2833526611328, "epoch": 0.9847975676108177, "grad_norm": 0.13539846241474152, "kl": 0.30142875015735626, "learning_rate": 1.379252930799102e-08, "loss": 0.0594, "reward": 1.7718750476837157, "reward_std": 0.22751090675592422, "rewards/accuracy_reward": 0.08333333730697631, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7343750178813935, "step": 3077 }, { "clip_ratio": 0.0, "completion_length": 586.7583557128906, "epoch": 0.985117618819011, "grad_norm": 0.3082070052623749, "kl": 0.3032428666949272, "learning_rate": 1.3211984854404337e-08, "loss": 0.1176, "reward": 1.7770833730697633, "reward_std": 0.22418717592954635, "rewards/accuracy_reward": 0.08333333451300859, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7395833551883697, "step": 3078 }, { "clip_ratio": 0.0, "completion_length": 568.8271057128907, "epoch": 0.9854376700272044, "grad_norm": 0.17651331424713135, "kl": 0.32943628504872324, "learning_rate": 1.264391434626533e-08, "loss": 0.1171, "reward": 1.7630208730697632, "reward_std": 0.19876088947057724, "rewards/accuracy_reward": 0.06666667070239782, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7380208551883698, "step": 3079 }, { "clip_ratio": 0.0, "completion_length": 562.633349609375, "epoch": 0.9857577212353976, "grad_norm": 0.12131522595882416, "kl": 0.22669636681675912, "learning_rate": 1.2088318493117534e-08, "loss": 0.0728, "reward": 1.734375035762787, "reward_std": 0.17790770083665847, "rewards/accuracy_reward": 0.025000000186264516, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7427083551883698, "step": 3080 }, { "clip_ratio": 0.0, "completion_length": 566.4125152587891, "epoch": 0.986077772443591, "grad_norm": 0.16687986254692078, "kl": 0.30181434378027916, "learning_rate": 1.1545197988925839e-08, "loss": 0.0896, "reward": 1.792187547683716, "reward_std": 0.17845623940229416, "rewards/accuracy_reward": 0.10000000298023223, "rewards/format_reward": 0.9541666865348816, "rewards/tag_count_reward": 0.7380208551883698, "step": 3081 }, { "clip_ratio": 0.0, "completion_length": 582.6791931152344, "epoch": 0.9863978236517843, "grad_norm": 0.166269451379776, "kl": 0.38384261056780816, "learning_rate": 1.1014553512072036e-08, "loss": 0.0944, "reward": 1.7432292222976684, "reward_std": 0.21463448256254197, "rewards/accuracy_reward": 0.05416666828095913, "rewards/format_reward": 0.950000011920929, "rewards/tag_count_reward": 0.7390625238418579, "step": 3082 }, { "clip_ratio": 0.0, "completion_length": 581.1666748046875, "epoch": 0.9867178748599776, "grad_norm": 0.26047223806381226, "kl": 0.3793337717652321, "learning_rate": 1.049638572535483e-08, "loss": 0.1121, "reward": 1.7692708611488341, "reward_std": 0.21841761842370033, "rewards/accuracy_reward": 0.07916666865348816, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7359375178813934, "step": 3083 }, { "clip_ratio": 0.0, "completion_length": 558.1937683105468, "epoch": 0.9870379260681709, "grad_norm": 0.2953816056251526, "kl": 0.26855692490935323, "learning_rate": 9.990695275988727e-09, "loss": 0.075, "reward": 1.8380208730697631, "reward_std": 0.19789665341377258, "rewards/accuracy_reward": 0.13125000447034835, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7380208551883698, "step": 3084 }, { "clip_ratio": 0.0, "completion_length": 565.4000305175781, "epoch": 0.9873579772763642, "grad_norm": 0.13979560136795044, "kl": 0.3356237094849348, "learning_rate": 9.49748279560514e-09, "loss": 0.0903, "reward": 1.8380208611488342, "reward_std": 0.2137792520225048, "rewards/accuracy_reward": 0.13750000204890966, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7380208551883698, "step": 3085 }, { "clip_ratio": 0.0, "completion_length": 574.4916870117188, "epoch": 0.9876780284845575, "grad_norm": 0.19319754838943481, "kl": 0.3143337398767471, "learning_rate": 9.01674890024684e-09, "loss": 0.0927, "reward": 1.7260417222976685, "reward_std": 0.23539431765675545, "rewards/accuracy_reward": 0.03541666716337204, "rewards/format_reward": 0.9479166984558105, "rewards/tag_count_reward": 0.7427083551883698, "step": 3086 }, { "clip_ratio": 0.0, "completion_length": 580.2896118164062, "epoch": 0.9879980796927509, "grad_norm": 0.13193345069885254, "kl": 0.33816151022911073, "learning_rate": 8.548494190372402e-09, "loss": 0.1099, "reward": 1.7890625715255737, "reward_std": 0.2728777229785919, "rewards/accuracy_reward": 0.11458333805203438, "rewards/format_reward": 0.9395833551883698, "rewards/tag_count_reward": 0.7348958551883698, "step": 3087 }, { "clip_ratio": 0.0, "completion_length": 557.3250244140625, "epoch": 0.9883181309009441, "grad_norm": 0.10972767323255539, "kl": 0.2844900615513325, "learning_rate": 8.092719250853975e-09, "loss": 0.0916, "reward": 1.7411458492279053, "reward_std": 0.14775414243340493, "rewards/accuracy_reward": 0.03333333432674408, "rewards/format_reward": 0.9687500178813935, "rewards/tag_count_reward": 0.7390625178813934, "step": 3088 }, { "clip_ratio": 0.0, "completion_length": 531.6646057128906, "epoch": 0.9886381821091375, "grad_norm": 0.15878772735595703, "kl": 0.34863837584853175, "learning_rate": 7.649424650972847e-09, "loss": 0.1037, "reward": 1.8317708730697633, "reward_std": 0.22507388815283774, "rewards/accuracy_reward": 0.13125000428408384, "rewards/format_reward": 0.9604166865348815, "rewards/tag_count_reward": 0.7401041865348816, "step": 3089 }, { "clip_ratio": 0.0, "completion_length": 551.4354278564454, "epoch": 0.9889582333173308, "grad_norm": 0.14936627447605133, "kl": 0.2459190659224987, "learning_rate": 7.218610944426108e-09, "loss": 0.0887, "reward": 1.7484375357627868, "reward_std": 0.19498306661844253, "rewards/accuracy_reward": 0.0479166692122817, "rewards/format_reward": 0.9604166805744171, "rewards/tag_count_reward": 0.7401041984558105, "step": 3090 }, { "clip_ratio": 0.0, "completion_length": 566.904183959961, "epoch": 0.9892782845255241, "grad_norm": 0.11002171039581299, "kl": 0.49073898121714593, "learning_rate": 6.800278669317762e-09, "loss": 0.1155, "reward": 1.7229166865348815, "reward_std": 0.24243892431259156, "rewards/accuracy_reward": 0.039583333395421506, "rewards/format_reward": 0.9500000238418579, "rewards/tag_count_reward": 0.7333333551883697, "step": 3091 }, { "clip_ratio": 0.0, "completion_length": 568.7021026611328, "epoch": 0.9895983357337174, "grad_norm": 0.11355835944414139, "kl": 0.33552836179733275, "learning_rate": 6.394428348164284e-09, "loss": 0.0884, "reward": 1.7526041984558105, "reward_std": 0.18140390366315842, "rewards/accuracy_reward": 0.054166667722165586, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7359375178813934, "step": 3092 }, { "clip_ratio": 0.0, "completion_length": 580.3541900634766, "epoch": 0.9899183869419107, "grad_norm": 0.19902049005031586, "kl": 0.24995511323213576, "learning_rate": 6.001060487891286e-09, "loss": 0.0791, "reward": 1.7546875476837158, "reward_std": 0.2084854982793331, "rewards/accuracy_reward": 0.06041666865348816, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7380208492279052, "step": 3093 }, { "clip_ratio": 0.0, "completion_length": 569.2750244140625, "epoch": 0.990238438150104, "grad_norm": 0.17721405625343323, "kl": 0.2684307098388672, "learning_rate": 5.6201755798313e-09, "loss": 0.0578, "reward": 1.7473958492279054, "reward_std": 0.12207645624876022, "rewards/accuracy_reward": 0.03541666772216558, "rewards/format_reward": 0.9729166865348816, "rewards/tag_count_reward": 0.739062511920929, "step": 3094 }, { "clip_ratio": 0.0, "completion_length": 577.2583435058593, "epoch": 0.9905584893582974, "grad_norm": 0.11798027902841568, "kl": 0.22919094637036325, "learning_rate": 5.251774099727103e-09, "loss": 0.0895, "reward": 1.8020833730697632, "reward_std": 0.20591284781694413, "rewards/accuracy_reward": 0.0958333358168602, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7416666924953461, "step": 3095 }, { "clip_ratio": 0.0, "completion_length": 568.3979400634765, "epoch": 0.9908785405664906, "grad_norm": 0.19848725199699402, "kl": 0.4313486650586128, "learning_rate": 4.895856507730612e-09, "loss": 0.1078, "reward": 1.7447916746139527, "reward_std": 0.23529223948717118, "rewards/accuracy_reward": 0.0708333333954215, "rewards/format_reward": 0.9375000178813935, "rewards/tag_count_reward": 0.7364583492279053, "step": 3096 }, { "clip_ratio": 0.0, "completion_length": 599.0312744140625, "epoch": 0.991198591774684, "grad_norm": 0.29588741064071655, "kl": 0.3257336333394051, "learning_rate": 4.55242324839622e-09, "loss": 0.1146, "reward": 1.7567708611488342, "reward_std": 0.28883601576089857, "rewards/accuracy_reward": 0.08541666883975267, "rewards/format_reward": 0.9375000178813935, "rewards/tag_count_reward": 0.7338541924953461, "step": 3097 }, { "clip_ratio": 0.0, "completion_length": 560.9854370117188, "epoch": 0.9915186429828773, "grad_norm": 0.1464318186044693, "kl": 0.31563855409622193, "learning_rate": 4.22147475068968e-09, "loss": 0.0892, "reward": 1.8328125596046447, "reward_std": 0.22565954253077508, "rewards/accuracy_reward": 0.1291666690260172, "rewards/format_reward": 0.9645833611488343, "rewards/tag_count_reward": 0.7390625178813934, "step": 3098 }, { "clip_ratio": 0.0, "completion_length": 551.0625091552735, "epoch": 0.9918386941910706, "grad_norm": 0.2843438386917114, "kl": 0.24993645697832106, "learning_rate": 3.903011427978109e-09, "loss": 0.0968, "reward": 1.7885417222976685, "reward_std": 0.22308254763484, "rewards/accuracy_reward": 0.0916666692122817, "rewards/format_reward": 0.9562500238418579, "rewards/tag_count_reward": 0.7406250238418579, "step": 3099 }, { "clip_ratio": 0.0, "completion_length": 565.564599609375, "epoch": 0.9921587453992639, "grad_norm": 0.1468961387872696, "kl": 0.2920463755726814, "learning_rate": 3.597033678038875e-09, "loss": 0.0485, "reward": 1.8338542103767395, "reward_std": 0.1874682992696762, "rewards/accuracy_reward": 0.12500000298023223, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7421875178813935, "step": 3100 }, { "clip_ratio": 0.0, "completion_length": 557.4166870117188, "epoch": 0.9924787966074572, "grad_norm": 0.15643829107284546, "kl": 0.31074003875255585, "learning_rate": 3.303541883049599e-09, "loss": 0.1386, "reward": 1.7671875596046447, "reward_std": 0.2778718054294586, "rewards/accuracy_reward": 0.08541667088866234, "rewards/format_reward": 0.9500000298023223, "rewards/tag_count_reward": 0.7317708492279053, "step": 3101 }, { "clip_ratio": 0.0, "completion_length": 591.9021057128906, "epoch": 0.9927988478156505, "grad_norm": 0.1353743076324463, "kl": 0.2804649338126183, "learning_rate": 3.0225364095970432e-09, "loss": 0.1057, "reward": 1.756250023841858, "reward_std": 0.2645873501896858, "rewards/accuracy_reward": 0.07916666679084301, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.7333333432674408, "step": 3102 }, { "clip_ratio": 0.0, "completion_length": 569.1333526611328, "epoch": 0.9931188990238438, "grad_norm": 0.0932333841919899, "kl": 0.25905941873788835, "learning_rate": 2.7540176086671145e-09, "loss": 0.1129, "reward": 1.7427083611488343, "reward_std": 0.1830264799296856, "rewards/accuracy_reward": 0.041666666977107525, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7385416865348816, "step": 3103 }, { "clip_ratio": 0.0, "completion_length": 551.8979431152344, "epoch": 0.9934389502320371, "grad_norm": 0.17291957139968872, "kl": 0.2739930372685194, "learning_rate": 2.4979858156537474e-09, "loss": 0.1074, "reward": 1.7869792103767395, "reward_std": 0.22782448977231978, "rewards/accuracy_reward": 0.09166667088866234, "rewards/format_reward": 0.954166692495346, "rewards/tag_count_reward": 0.7411458551883697, "step": 3104 }, { "clip_ratio": 0.0, "completion_length": 583.5666870117187, "epoch": 0.9937590014402304, "grad_norm": 0.14900268614292145, "kl": 0.22179678678512574, "learning_rate": 2.2544413503522432e-09, "loss": 0.0685, "reward": 1.8458333730697631, "reward_std": 0.23180068656802177, "rewards/accuracy_reward": 0.14375000465661286, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7395833551883697, "step": 3105 }, { "clip_ratio": 0.0, "completion_length": 554.6395965576172, "epoch": 0.9940790526484238, "grad_norm": 0.16432535648345947, "kl": 0.378567086905241, "learning_rate": 2.02338451695816e-09, "loss": 0.0833, "reward": 1.7901042222976684, "reward_std": 0.22208391055464743, "rewards/accuracy_reward": 0.08541666939854622, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7401041805744171, "step": 3106 }, { "clip_ratio": 0.0, "completion_length": 564.2937744140625, "epoch": 0.994399103856617, "grad_norm": 0.1785258948802948, "kl": 0.21541684567928315, "learning_rate": 1.804815604075083e-09, "loss": 0.0671, "reward": 1.788020873069763, "reward_std": 0.16471139043569566, "rewards/accuracy_reward": 0.07500000335276127, "rewards/format_reward": 0.9708333432674408, "rewards/tag_count_reward": 0.7421875178813935, "step": 3107 }, { "clip_ratio": 0.0, "completion_length": 581.3437713623047, "epoch": 0.9947191550648103, "grad_norm": 0.2913496196269989, "kl": 0.38024489805102346, "learning_rate": 1.5987348847024132e-09, "loss": 0.1083, "reward": 1.8026042103767395, "reward_std": 0.24502479285001755, "rewards/accuracy_reward": 0.11666666865348815, "rewards/format_reward": 0.9520833551883697, "rewards/tag_count_reward": 0.7338541805744171, "step": 3108 }, { "clip_ratio": 0.0, "completion_length": 554.1750183105469, "epoch": 0.9950392062730037, "grad_norm": 0.17706918716430664, "kl": 0.24948984608054162, "learning_rate": 1.4051426162464687e-09, "loss": 0.0613, "reward": 1.7614583611488341, "reward_std": 0.1818772867321968, "rewards/accuracy_reward": 0.0520833358168602, "rewards/format_reward": 0.9666666924953461, "rewards/tag_count_reward": 0.7427083492279053, "step": 3109 }, { "clip_ratio": 0.0, "completion_length": 592.2291870117188, "epoch": 0.995359257481197, "grad_norm": 0.16352076828479767, "kl": 0.3233878821134567, "learning_rate": 1.2240390405116043e-09, "loss": 0.0913, "reward": 1.756250023841858, "reward_std": 0.23520760014653205, "rewards/accuracy_reward": 0.07708333563059569, "rewards/format_reward": 0.9437500178813935, "rewards/tag_count_reward": 0.735416692495346, "step": 3110 }, { "clip_ratio": 0.0, "completion_length": 582.6187622070313, "epoch": 0.9956793086893903, "grad_norm": 0.3277549147605896, "kl": 0.3297587588429451, "learning_rate": 1.0554243837035404e-09, "loss": 0.1324, "reward": 1.7932292222976685, "reward_std": 0.23931009843945503, "rewards/accuracy_reward": 0.11041666865348816, "rewards/format_reward": 0.9437500238418579, "rewards/tag_count_reward": 0.7390625178813934, "step": 3111 }, { "clip_ratio": 0.0, "completion_length": 578.1791748046875, "epoch": 0.9959993598975836, "grad_norm": 0.1446753889322281, "kl": 0.25720045566558836, "learning_rate": 8.992988564315852e-10, "loss": 0.0708, "reward": 1.7598958730697631, "reward_std": 0.20648740902543067, "rewards/accuracy_reward": 0.06250000037252904, "rewards/format_reward": 0.9583333551883697, "rewards/tag_count_reward": 0.7390625178813934, "step": 3112 }, { "clip_ratio": 0.0, "completion_length": 569.9125061035156, "epoch": 0.9963194111057769, "grad_norm": 0.14114601910114288, "kl": 0.2895710654556751, "learning_rate": 7.556626537019717e-10, "loss": 0.0735, "reward": 1.764062523841858, "reward_std": 0.16610406339168549, "rewards/accuracy_reward": 0.05625000186264515, "rewards/format_reward": 0.9666666805744171, "rewards/tag_count_reward": 0.7411458611488342, "step": 3113 }, { "clip_ratio": 0.0, "completion_length": 571.2562683105468, "epoch": 0.9966394623139703, "grad_norm": 0.15220247209072113, "kl": 0.25976728796958926, "learning_rate": 6.245159549223001e-10, "loss": 0.0824, "reward": 1.8223958730697631, "reward_std": 0.14715693891048431, "rewards/accuracy_reward": 0.10416667088866234, "rewards/format_reward": 0.9729166805744172, "rewards/tag_count_reward": 0.745312511920929, "step": 3114 }, { "clip_ratio": 0.0, "completion_length": 582.3187683105468, "epoch": 0.9969595135221635, "grad_norm": 0.12920396029949188, "kl": 0.2570869214832783, "learning_rate": 5.058589239026468e-10, "loss": 0.0691, "reward": 1.8505208849906922, "reward_std": 0.22681027501821518, "rewards/accuracy_reward": 0.14375000465661286, "rewards/format_reward": 0.9645833551883698, "rewards/tag_count_reward": 0.7421875238418579, "step": 3115 }, { "clip_ratio": 0.0, "completion_length": 553.4187683105469, "epoch": 0.9972795647303568, "grad_norm": 0.17800143361091614, "kl": 0.3351675134152174, "learning_rate": 3.9969170884890384e-10, "loss": 0.0882, "reward": 1.8515625238418578, "reward_std": 0.20919253826141357, "rewards/accuracy_reward": 0.14375000353902578, "rewards/format_reward": 0.9666666865348816, "rewards/tag_count_reward": 0.7411458492279053, "step": 3116 }, { "clip_ratio": 0.0, "completion_length": 548.8896057128907, "epoch": 0.9975996159385502, "grad_norm": 0.20195423066616058, "kl": 0.3757719676941633, "learning_rate": 3.0601444236944e-10, "loss": 0.0951, "reward": 1.7546875476837158, "reward_std": 0.21815839111804963, "rewards/accuracy_reward": 0.07291666883975267, "rewards/format_reward": 0.9458333492279053, "rewards/tag_count_reward": 0.7359375178813934, "step": 3117 }, { "clip_ratio": 0.0, "completion_length": 556.9270965576172, "epoch": 0.9979196671467435, "grad_norm": 0.17428921163082123, "kl": 0.24478441402316092, "learning_rate": 2.2482724147177005e-10, "loss": 0.0998, "reward": 1.772395873069763, "reward_std": 0.21767098605632781, "rewards/accuracy_reward": 0.07083333488553763, "rewards/format_reward": 0.962500023841858, "rewards/tag_count_reward": 0.7390625178813934, "step": 3118 }, { "clip_ratio": 0.0, "completion_length": 570.952099609375, "epoch": 0.9982397183549367, "grad_norm": 0.19090551137924194, "kl": 0.2739730294793844, "learning_rate": 1.561302075625548e-10, "loss": 0.0562, "reward": 1.8151041865348816, "reward_std": 0.19158529341220856, "rewards/accuracy_reward": 0.10833333656191826, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7442708492279053, "step": 3119 }, { "clip_ratio": 0.0, "completion_length": 560.8687713623046, "epoch": 0.9985597695631301, "grad_norm": 0.2590288817882538, "kl": 0.24943210408091546, "learning_rate": 9.9923426446491e-11, "loss": 0.0792, "reward": 1.8656250476837157, "reward_std": 0.2500589728355408, "rewards/accuracy_reward": 0.16041667014360428, "rewards/format_reward": 0.9645833492279052, "rewards/tag_count_reward": 0.7406250178813935, "step": 3120 }, { "clip_ratio": 0.0, "completion_length": 548.5166900634765, "epoch": 0.9988798207713234, "grad_norm": 0.16108450293540955, "kl": 0.1967288039624691, "learning_rate": 5.620696832964179e-11, "loss": 0.087, "reward": 1.8333333730697632, "reward_std": 0.1661988228559494, "rewards/accuracy_reward": 0.11666666977107525, "rewards/format_reward": 0.9708333432674408, "rewards/tag_count_reward": 0.7458333492279052, "step": 3121 }, { "clip_ratio": 0.0, "completion_length": 601.1583557128906, "epoch": 0.9991998719795168, "grad_norm": 0.10591613501310349, "kl": 0.4319122813642025, "learning_rate": 2.4980887813885745e-11, "loss": 0.0541, "reward": 1.7588541984558106, "reward_std": 0.18204645216464996, "rewards/accuracy_reward": 0.06666666772216559, "rewards/format_reward": 0.9541666805744171, "rewards/tag_count_reward": 0.7380208492279052, "step": 3122 }, { "clip_ratio": 0.0, "completion_length": 575.2312774658203, "epoch": 0.99951992318771, "grad_norm": 0.13070808351039886, "kl": 0.35216558873653414, "learning_rate": 6.245223903578179e-12, "loss": 0.1193, "reward": 1.7572917103767396, "reward_std": 0.24927352666854857, "rewards/accuracy_reward": 0.07708333637565375, "rewards/format_reward": 0.9395833611488342, "rewards/tag_count_reward": 0.7406250178813935, "step": 3123 }, { "clip_ratio": 0.0, "completion_length": 572.9297424316406, "epoch": 0.9998399743959033, "grad_norm": 0.24119921028614044, "kl": 0.23806431293487548, "learning_rate": 0.0, "loss": 0.072, "reward": 1.7848958730697633, "reward_std": 0.2171033151447773, "rewards/accuracy_reward": 0.08125000149011612, "rewards/format_reward": 0.9625000178813934, "rewards/tag_count_reward": 0.7411458551883697, "step": 3124 }, { "epoch": 0.9998399743959033, "step": 3124, "total_flos": 0.0, "train_loss": 0.07466813361974993, "train_runtime": 412723.7185, "train_samples_per_second": 0.227, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 3124, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }